Example development notebook for app_predict

Related wiki page:
https://github.com/stharrold/demo/wiki/app_predict

In [1]:
cd /opt/demo
/opt/demo
In [2]:
# Import standard packages.
import os
import sys
import time
# Import installed packages.
# Import local packages.
sys.path.insert(0, os.path.join(os.path.curdir, r'demo'))
%reload_ext autoreload
%autoreload 2
import demo
%matplotlib inline
/opt/conda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/opt/conda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
In [3]:
print("Timestamp:")
print(time.strftime(r'%Y-%m-%dT%H:%M:%S%Z', time.gmtime()))
print()
print("Versions:")
print("Python:", sys.version_info)
print("demo:", demo.__version__)
Timestamp:
2017-03-01T19:29:31GMT

Versions:
Python: sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)
demo: 0.0.0
In [4]:
!py.test -v --pdb
============================= test session starts ==============================
platform linux -- Python 3.5.2, pytest-2.9.2, py-1.4.31, pluggy-0.3.1 -- /opt/conda/bin/python
cachedir: .cache
rootdir: /opt/demo, inifile: 
collected 8 items 

tests/test__init__.py::test__all__ PASSED
tests/test__init__.py::test__version__ PASSED
tests/test_app_template/test_app_template__init__.py::test__all__ PASSED
tests/test_app_template/test_app_template_main.py::test__all__ PASSED
tests/test_app_template/test_app_template_main.py::test_main PASSED
tests/test_app_template/test_app_template_main.py::test__main__ PASSED
tests/test_app_template/test_template.py::test_prepend_this PASSED
tests/test_utils/test_utils__init__.py::test__all__ PASSED

=========================== 8 passed in 5.56 seconds ===========================

Development below

Initialization

Imports

In [1]:
cd /opt/demo
/opt/demo
In [2]:
# Import standard packages.
import bs4
import collections
import gc
import glob
import io
import itertools
import logging
import os
import pickle
import requests
import shelve
import subprocess
import sys
import textwrap
import time
import warnings
# Import installed packages.
from IPython.display import display, SVG
import geopy
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn as sk
import sklearn.cross_validation as sk_cv
import sklearn.cluster as sk_cl
import sklearn.decomposition as sk_dc
import sklearn.ensemble as sk_ens
import sklearn.metrics as sk_met
import sklearn.preprocessing as sk_pre
import sklearn.tree as sk_tr
# Import local packages.
# datascience version: TODO link to GitHub tag v0.0.1.
sys.path.insert(0, os.path.join(os.path.curdir, r'demo'))
%reload_ext autoreload
%autoreload 2
import demo
%matplotlib inline
In [3]:
print("Timestamp:")
print(time.strftime(r'%Y-%m-%dT%H:%M:%S%Z', time.gmtime()))
print()
print("Versions:")
print("Python:", sys.version_info)
print("matplotlib:", mpl.__version__)
print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("scipy:", scipy.__version__)
print("sklearn:", sk.__version__)
print("seaborn:", sns.__version__)
print("demo:", demo.__version__)
Timestamp:
2017-03-02T10:00:03GMT

Versions:
Python: sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)
matplotlib: 1.5.1
numpy: 1.11.1
pandas: 0.18.1
scipy: 0.17.1
sklearn: 0.17.1
seaborn: 0.7.1
demo: 0.0.0

Globals

In [4]:
# State settings.
sns.set() # Set matplot lib styles with seaborn
np.random.seed(seed=1) # Set random state for reproducibility.

# File paths
path_data_dir = os.path.join(os.path.abspath(os.path.curdir), r'demo/app_predict/data')
path_csv = os.path.join(path_data_dir, r'Risky Dealer Case Study Transactions.csv')

# Statistics
# percentiles = [-1.6449 sigma, -1 sigma, mean, +1 sigma, +1.6449 sigma] for normal distribution
# 90.00% confidence interval = (percentiles[0], percentiles[-1])
# 68.27% confidence interval = (percentiles[1], percentiles[-2])
# median = percentiles[2]
percentiles = [0.0500, 0.1587, 0.5000, 0.8413, 0.9500]

# Buyer Return Rate.
# Greater than 10% is excessive.
buyer_retrate = 'BuyerID_fracReturned1DivReturnedNotNull'
buyer_retrate_max = 0.1

# Logger
logger = logging.getLogger()

Extract, transform, and load

In [5]:
print(r"""`df`: Load and format sales transactions.""")
df_backup = pd.read_csv(path_csv)
df_backup.head()
`df`: Load and format sales transactions.
Out[5]:
BuyerID SellerID Mileage VIN SellingLocation CarMake SalePrice CarYear MMR JDPowersCat ... Salvage OVE Simulcast InLane PSIEligible SaleDate Autocheck_score ConditionReport DSEligible Returned
0 351757 441081 37055 1FAHP3M27CL374352 CAAI FORD 13000 2012 13050 COMPACT CAR ... 0 0 0 0 1 13-07-25 1.000000 NaN 1 0.0
1 351757 442235 248 1G1JE6SB7D4103706 DETA CHEVROLET 13900 2013 13600 COMPACT CAR ... 0 0 0 0 1 13-10-24 NaN NaN 1 NaN
2 351757 436280 39627 1G1PG5SC4C7102274 CAAI CHEVROLET 13800 2012 14050 MIDSIZE CAR ... 0 0 0 1 1 13-04-18 -2.666667 41 1 NaN
3 351757 436280 50773 1G1PG5SC5C7113820 CAAI CHEVROLET 12900 2012 13150 MIDSIZE CAR ... 0 0 0 1 1 13-04-18 1.000000 31 1 NaN
4 351757 434202 54133 1GNKVGED4BJ301146 SVAA CHEVROLET 17800 2011 18050 SUV ... 0 0 0 1 1 13-02-26 -1.400000 34 1 NaN

5 rows × 25 columns

In [6]:
df = demo.app_predict.predict.etl(df=df_backup)
df = demo.app_predict.predict.create_features(df=df, path_data_dir=path_data_dir)
with pd.option_context('display.max_rows', len(df.dtypes)):
    print(df.dtypes)
BuyerID                                                      object
SellerID                                                     object
Mileage                                                       int64
VIN                                                          object
SellingLocation                                              object
CarMake                                                      object
SalePrice                                                     int64
CarYear                                                       int64
MMR                                                           int64
JDPowersCat                                                  object
LIGHTG                                                        int64
LIGHTY                                                        int64
LIGHTR                                                        int64
PSI                                                           int64
Arbitrated                                                    int64
Salvage                                                       int64
OVE                                                           int64
Simulcast                                                     int64
InLane                                                        int64
PSIEligible                                                   int64
SaleDate                                             datetime64[ns]
Autocheck_score                                             float64
ConditionReport                                               int64
DSEligible                                                    int64
Returned                                                      int64
Returned_asm                                                  int64
SellingLocation_lat                                         float64
SellingLocation_lon                                         float64
JDPowersCat_COMPACTCAR                                      float64
JDPowersCat_EXCLUDED                                        float64
JDPowersCat_FULLSIZECAR                                     float64
JDPowersCat_LUXURYCAR                                       float64
JDPowersCat_MIDSIZECAR                                      float64
JDPowersCat_PICKUP                                          float64
JDPowersCat_SPORTSCAR                                       float64
JDPowersCat_SUV                                             float64
JDPowersCat_UNKNOWN                                         float64
JDPowersCat_VAN                                             float64
LIGHT_N0G1Y2R3                                                int64
SaleDate_dow                                                  int64
SaleDate_doy                                                  int64
SaleDate_day                                                  int64
SaleDate_decyear                                            float64
BuyerID_numTransactions                                       int64
BuyerID_numDSEligible1                                        int64
BuyerID_fracDSEligible1DivTransactions                      float64
BuyerID_numReturnedNotNull                                    int64
BuyerID_fracReturnedNotNullDivDSEligible1                   float64
BuyerID_numReturned1                                          int64
BuyerID_fracReturned1DivReturnedNotNull                     float64
BuyerID_numReturnedasm1                                       int64
BuyerID_fracReturnedasm1DivTransactions                     float64
SellerID_numTransactions                                      int64
SellerID_numDSEligible1                                       int64
SellerID_fracDSEligible1DivTransactions                     float64
SellerID_numReturnedNotNull                                   int64
SellerID_fracReturnedNotNullDivDSEligible1                  float64
SellerID_numReturned1                                         int64
SellerID_fracReturned1DivReturnedNotNull                    float64
SellerID_numReturnedasm1                                      int64
SellerID_fracReturnedasm1DivTransactions                    float64
VIN_numTransactions                                           int64
VIN_numDSEligible1                                            int64
VIN_fracDSEligible1DivTransactions                          float64
VIN_numReturnedNotNull                                        int64
VIN_fracReturnedNotNullDivDSEligible1                       float64
VIN_numReturned1                                              int64
VIN_fracReturned1DivReturnedNotNull                         float64
VIN_numReturnedasm1                                           int64
VIN_fracReturnedasm1DivTransactions                         float64
SellingLocation_numTransactions                               int64
SellingLocation_numDSEligible1                                int64
SellingLocation_fracDSEligible1DivTransactions              float64
SellingLocation_numReturnedNotNull                            int64
SellingLocation_fracReturnedNotNullDivDSEligible1           float64
SellingLocation_numReturned1                                  int64
SellingLocation_fracReturned1DivReturnedNotNull             float64
SellingLocation_numReturnedasm1                               int64
SellingLocation_fracReturnedasm1DivTransactions             float64
CarMake_numTransactions                                       int64
CarMake_numDSEligible1                                        int64
CarMake_fracDSEligible1DivTransactions                      float64
CarMake_numReturnedNotNull                                    int64
CarMake_fracReturnedNotNullDivDSEligible1                   float64
CarMake_numReturned1                                          int64
CarMake_fracReturned1DivReturnedNotNull                     float64
CarMake_numReturnedasm1                                       int64
CarMake_fracReturnedasm1DivTransactions                     float64
JDPowersCat_numTransactions                                   int64
JDPowersCat_numDSEligible1                                    int64
JDPowersCat_fracDSEligible1DivTransactions                  float64
JDPowersCat_numReturnedNotNull                                int64
JDPowersCat_fracReturnedNotNullDivDSEligible1               float64
JDPowersCat_numReturned1                                      int64
JDPowersCat_fracReturned1DivReturnedNotNull                 float64
JDPowersCat_numReturnedasm1                                   int64
JDPowersCat_fracReturnedasm1DivTransactions                 float64
dtype: object

Exploratory data analysis

In [23]:
# Describe columns
opt = pd.get_option('display.max_columns')
pd.set_option('display.max_columns', len(df.columns))
df.describe(include='all')
Out[23]:
Arbitrated Autocheck_score BuyerID_fracDSEligible1DivTransactions BuyerID_fracReturned1DivReturnedNotNull BuyerID_fracReturnedNotNullDivDSEligible1 BuyerID_fracReturnedasm1DivTransactions BuyerID_numDSEligible1 BuyerID_numReturned1 BuyerID_numReturnedNotNull BuyerID_numReturnedasm1 BuyerID_numTransactions CarMake_fracDSEligible1DivTransactions CarMake_fracReturned1DivReturnedNotNull CarMake_fracReturnedNotNullDivDSEligible1 CarMake_fracReturnedasm1DivTransactions CarMake_numDSEligible1 CarMake_numReturned1 CarMake_numReturnedNotNull CarMake_numReturnedasm1 CarMake_numTransactions CarYear ConditionReport DSEligible InLane JDPowersCat_COMPACTCAR JDPowersCat_EXCLUDED JDPowersCat_FULLSIZECAR JDPowersCat_LUXURYCAR JDPowersCat_MIDSIZECAR JDPowersCat_PICKUP JDPowersCat_SPORTSCAR JDPowersCat_SUV JDPowersCat_UNKNOWN JDPowersCat_VAN JDPowersCat_fracDSEligible1DivTransactions JDPowersCat_fracReturned1DivReturnedNotNull JDPowersCat_fracReturnedNotNullDivDSEligible1 JDPowersCat_fracReturnedasm1DivTransactions JDPowersCat_numDSEligible1 JDPowersCat_numReturned1 JDPowersCat_numReturnedNotNull JDPowersCat_numReturnedasm1 JDPowersCat_numTransactions LIGHTG LIGHTR LIGHTY LIGHT_N0G1Y2R3 MMR Mileage OVE PSI PSIEligible SaleDate_day SaleDate_decyear SaleDate_dow SaleDate_doy SalePrice Salvage SellerID_fracDSEligible1DivTransactions SellerID_fracReturned1DivReturnedNotNull SellerID_fracReturnedNotNullDivDSEligible1 SellerID_fracReturnedasm1DivTransactions SellerID_numDSEligible1 SellerID_numReturned1 SellerID_numReturnedNotNull SellerID_numReturnedasm1 SellerID_numTransactions SellingLocation_fracDSEligible1DivTransactions SellingLocation_fracReturned1DivReturnedNotNull SellingLocation_fracReturnedNotNullDivDSEligible1 SellingLocation_fracReturnedasm1DivTransactions SellingLocation_lat SellingLocation_lon SellingLocation_numDSEligible1 SellingLocation_numReturned1 SellingLocation_numReturnedNotNull SellingLocation_numReturnedasm1 SellingLocation_numTransactions Simulcast VIN_fracDSEligible1DivTransactions VIN_fracReturned1DivReturnedNotNull VIN_fracReturnedNotNullDivDSEligible1 VIN_fracReturnedasm1DivTransactions VIN_numDSEligible1 VIN_numReturned1 VIN_numReturnedNotNull VIN_numReturnedasm1 VIN_numTransactions
count 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000 278337.000000
mean 0.071316 1.198252 0.943581 0.107117 0.034551 0.063900 453.121827 1.252410 9.587022 12.178467 464.047884 0.946847 0.151509 0.028892 0.059119 9862.799344 83.209397 414.703399 669.181194 10448.771141 2007.407140 30.818285 0.944876 0.333700 0.125208 0.004800 0.011723 0.116345 0.221070 0.129473 0.032493 0.304778 0.001279 0.052832 0.946902 0.151373 0.028677 0.059050 25391.878335 242.096487 1102.838739 1535.855064 26685.636911 0.267977 0.217998 0.042592 1.007157 11388.524882 84228.544933 0.034034 0.306650 0.690659 15.596895 2013.526312 1.939318 193.630351 11973.232032 0.037555 0.944089 0.097179 0.039199 0.063685 494.175126 6.875238 32.126106 21.041644 508.341532 0.946303 0.106864 0.029225 0.059736 34.338469 -87.579034 8217.593320 116.052375 498.316293 630.544746 8732.085691 0.069459 0.944501 0.020815 0.072437 0.072814 0.981095 0.021837 0.079350 0.078822 1.038080
std 0.257353 1.715611 0.134171 0.254969 0.100233 0.138196 878.971988 6.337838 64.848962 37.019401 883.219232 0.033339 0.116539 0.029969 0.034017 10473.944904 125.020210 648.924844 792.745021 11147.755468 3.956124 5.129596 0.228222 0.471535 0.330955 0.069115 0.107637 0.320638 0.414968 0.335723 0.177306 0.460314 0.035741 0.223698 0.018795 0.101374 0.026053 0.019568 20343.521446 313.702224 1421.563472 1221.845520 21216.599910 0.442907 0.412887 0.201936 1.177273 9025.527367 54879.931708 0.181318 0.461104 0.462223 8.543311 0.259901 1.128183 95.123686 9083.860494 0.190118 0.158037 0.194698 0.108657 0.161155 1022.181316 41.863657 168.576836 64.173002 1040.906277 0.033447 0.136336 0.044604 0.037576 4.338165 9.354517 8746.189038 216.460544 910.684602 824.662178 9311.906488 0.254233 0.227985 0.141990 0.254448 0.255523 0.309004 0.150152 0.277162 0.274546 0.206104
min 0.000000 -32.599998 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1936.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 2013.081967 0.000000 31.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 21.342331 -157.894707 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 0.000000 0.625000 0.962963 0.000000 0.000000 0.000000 34.000000 0.000000 0.000000 0.000000 38.000000 0.940678 0.000000 0.002083 0.049117 2039.000000 0.000000 6.000000 111.000000 2150.000000 2005.000000 30.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.939972 0.000000 0.002392 0.047186 8758.000000 0.000000 22.000000 521.000000 9265.000000 0.000000 0.000000 0.000000 0.000000 4775.000000 38778.000000 0.000000 0.000000 0.000000 8.000000 2013.303279 1.000000 112.000000 5600.000000 0.000000 0.959959 0.000000 0.000000 0.000000 10.000000 0.000000 0.000000 0.000000 12.000000 0.934462 0.000000 0.000000 0.032551 29.644441 -95.305058 1561.000000 0.000000 0.000000 60.000000 1636.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
50% 0.000000 1.307692 0.993671 0.000000 0.000000 0.009146 104.000000 0.000000 0.000000 2.000000 112.000000 0.945253 0.185634 0.023974 0.058222 5616.000000 26.000000 109.000000 318.000000 5900.000000 2008.000000 30.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.945280 0.199100 0.023584 0.058522 20934.000000 94.000000 439.000000 1223.000000 22093.000000 0.000000 0.000000 0.000000 1.000000 9700.000000 79599.000000 0.000000 0.000000 1.000000 15.000000 2013.532787 2.000000 196.000000 10100.000000 0.000000 0.991255 0.000000 0.000000 0.014778 69.000000 0.000000 0.000000 2.000000 75.000000 0.947490 0.000000 0.000789 0.057628 34.294595 -86.444251 5179.000000 0.000000 2.000000 239.000000 5445.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
75% 0.000000 2.250000 1.000000 0.000000 0.012658 0.055556 360.000000 0.000000 2.000000 10.000000 381.000000 0.956981 0.216374 0.052768 0.067680 14779.000000 109.000000 507.000000 950.000000 15642.000000 2011.000000 30.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.960821 0.222055 0.053513 0.068047 36523.000000 401.000000 1795.000000 2280.000000 38390.000000 1.000000 0.000000 0.000000 2.000000 15750.000000 114088.000000 0.000000 1.000000 1.000000 23.000000 2013.748634 3.000000 275.000000 16000.000000 0.000000 1.000000 0.128000 0.037333 0.052632 418.000000 1.000000 7.000000 12.000000 439.000000 0.968053 0.243865 0.052452 0.078275 38.399463 -79.894945 11784.000000 137.000000 592.000000 904.000000 12574.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
max 1.000000 7.571429 1.000000 1.000000 1.000000 1.000000 5589.000000 157.000000 1452.000000 620.000000 5620.000000 1.000000 1.000000 1.000000 1.000000 44277.000000 569.000000 3241.000000 3364.000000 47072.000000 2014.000000 50.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.437500 0.078305 0.500000 81571.000000 1311.000000 6165.000000 4571.000000 84831.000000 1.000000 1.000000 1.000000 3.000000 172000.000000 999999.000000 1.000000 1.000000 1.000000 31.000000 2013.994536 6.000000 365.000000 355000.000000 1.000000 1.000000 1.000000 1.000000 1.000000 7054.000000 725.000000 2736.000000 886.000000 7094.000000 1.000000 1.000000 0.219342 1.000000 47.425773 -71.163175 39569.000000 955.000000 4324.000000 3641.000000 41994.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5.000000 3.000000 4.000000 3.000000 6.000000
In [12]:
# re-initialize matplotlib inline in case handle dropped
%matplotlib inline
demo.app_predict.predict.plot_eda(
    df=df,
    columns=sorted(df.columns[np.logical_or(df.dtypes=='int64', df.dtypes=='float64')]),
    path_plot_dir=os.path.join(path_data_dir, 'plot_eda'))
################################################################################
Plot frequency distributions (histograms) of columns.
########################################
Feature: Arbitrated
Timestamp: 2017-02-28T07:12:17GMT
########################################
Feature: Autocheck_score
Timestamp: 2017-02-28T07:12:19GMT
########################################
Feature: BuyerID_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:12:21GMT
########################################
Feature: BuyerID_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:12:23GMT
########################################
Feature: BuyerID_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:12:25GMT
########################################
Feature: BuyerID_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:12:27GMT
########################################
Feature: BuyerID_numDSEligible1
Timestamp: 2017-02-28T07:12:29GMT
########################################
Feature: BuyerID_numReturned1
Timestamp: 2017-02-28T07:12:31GMT
########################################
Feature: BuyerID_numReturnedNotNull
Timestamp: 2017-02-28T07:12:33GMT
########################################
Feature: BuyerID_numReturnedasm1
Timestamp: 2017-02-28T07:12:35GMT
########################################
Feature: BuyerID_numTransactions
Timestamp: 2017-02-28T07:12:37GMT
########################################
Feature: CarMake_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:12:39GMT
########################################
Feature: CarMake_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:12:41GMT
########################################
Feature: CarMake_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:12:43GMT
########################################
Feature: CarMake_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:12:45GMT
########################################
Feature: CarMake_numDSEligible1
Timestamp: 2017-02-28T07:12:47GMT
########################################
Feature: CarMake_numReturned1
Timestamp: 2017-02-28T07:12:49GMT
########################################
Feature: CarMake_numReturnedNotNull
Timestamp: 2017-02-28T07:12:51GMT
########################################
Feature: CarMake_numReturnedasm1
Timestamp: 2017-02-28T07:12:53GMT
########################################
Feature: CarMake_numTransactions
Timestamp: 2017-02-28T07:12:54GMT
########################################
Feature: CarYear
Timestamp: 2017-02-28T07:12:56GMT
########################################
Feature: ConditionReport
Timestamp: 2017-02-28T07:12:58GMT
########################################
Feature: DSEligible
Timestamp: 2017-02-28T07:13:00GMT
########################################
Feature: InLane
Timestamp: 2017-02-28T07:13:02GMT
########################################
Feature: JDPowersCat_COMPACTCAR
Timestamp: 2017-02-28T07:13:04GMT
########################################
Feature: JDPowersCat_EXCLUDED
Timestamp: 2017-02-28T07:13:06GMT
########################################
Feature: JDPowersCat_FULLSIZECAR
Timestamp: 2017-02-28T07:13:08GMT
########################################
Feature: JDPowersCat_LUXURYCAR
Timestamp: 2017-02-28T07:13:10GMT
########################################
Feature: JDPowersCat_MIDSIZECAR
Timestamp: 2017-02-28T07:13:12GMT
########################################
Feature: JDPowersCat_PICKUP
Timestamp: 2017-02-28T07:13:14GMT
########################################
Feature: JDPowersCat_SPORTSCAR
Timestamp: 2017-02-28T07:13:16GMT
########################################
Feature: JDPowersCat_SUV
Timestamp: 2017-02-28T07:13:18GMT
########################################
Feature: JDPowersCat_UNKNOWN
Timestamp: 2017-02-28T07:13:20GMT
########################################
Feature: JDPowersCat_VAN
Timestamp: 2017-02-28T07:13:22GMT
########################################
Feature: JDPowersCat_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:13:24GMT
########################################
Feature: JDPowersCat_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:13:26GMT
########################################
Feature: JDPowersCat_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:13:28GMT
########################################
Feature: JDPowersCat_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:13:30GMT
########################################
Feature: JDPowersCat_numDSEligible1
Timestamp: 2017-02-28T07:13:32GMT
########################################
Feature: JDPowersCat_numReturned1
Timestamp: 2017-02-28T07:13:33GMT
########################################
Feature: JDPowersCat_numReturnedNotNull
Timestamp: 2017-02-28T07:13:35GMT
########################################
Feature: JDPowersCat_numReturnedasm1
Timestamp: 2017-02-28T07:13:37GMT
########################################
Feature: JDPowersCat_numTransactions
Timestamp: 2017-02-28T07:13:39GMT
########################################
Feature: LIGHTG
Timestamp: 2017-02-28T07:13:41GMT
########################################
Feature: LIGHTR
Timestamp: 2017-02-28T07:13:43GMT
########################################
Feature: LIGHTY
Timestamp: 2017-02-28T07:13:45GMT
########################################
Feature: LIGHT_N0G1Y2R3
Timestamp: 2017-02-28T07:13:46GMT
########################################
Feature: MMR
Timestamp: 2017-02-28T07:13:48GMT
########################################
Feature: Mileage
Timestamp: 2017-02-28T07:13:50GMT
########################################
Feature: OVE
Timestamp: 2017-02-28T07:13:52GMT
########################################
Feature: PSI
Timestamp: 2017-02-28T07:13:54GMT
########################################
Feature: PSIEligible
Timestamp: 2017-02-28T07:13:55GMT
########################################
Feature: Returned
Timestamp: 2017-02-28T07:13:57GMT
########################################
Feature: Returned_asm
Timestamp: 2017-02-28T07:13:59GMT
########################################
Feature: SaleDate_day
Timestamp: 2017-02-28T07:14:01GMT
########################################
Feature: SaleDate_decyear
Timestamp: 2017-02-28T07:14:03GMT
########################################
Feature: SaleDate_dow
Timestamp: 2017-02-28T07:14:04GMT
########################################
Feature: SaleDate_doy
Timestamp: 2017-02-28T07:14:06GMT
########################################
Feature: SalePrice
Timestamp: 2017-02-28T07:14:08GMT
########################################
Feature: Salvage
Timestamp: 2017-02-28T07:14:10GMT
########################################
Feature: SellerID_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:14:12GMT
########################################
Feature: SellerID_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:14:14GMT
########################################
Feature: SellerID_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:14:15GMT
########################################
Feature: SellerID_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:14:17GMT
########################################
Feature: SellerID_numDSEligible1
Timestamp: 2017-02-28T07:14:19GMT
########################################
Feature: SellerID_numReturned1
Timestamp: 2017-02-28T07:14:21GMT
########################################
Feature: SellerID_numReturnedNotNull
Timestamp: 2017-02-28T07:14:23GMT
########################################
Feature: SellerID_numReturnedasm1
Timestamp: 2017-02-28T07:14:25GMT
########################################
Feature: SellerID_numTransactions
Timestamp: 2017-02-28T07:14:26GMT
########################################
Feature: SellingLocation_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:14:28GMT
########################################
Feature: SellingLocation_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:14:30GMT
########################################
Feature: SellingLocation_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:14:32GMT
########################################
Feature: SellingLocation_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:14:34GMT
########################################
Feature: SellingLocation_lat
Timestamp: 2017-02-28T07:14:36GMT
########################################
Feature: SellingLocation_lon
Timestamp: 2017-02-28T07:14:38GMT
########################################
Feature: SellingLocation_numDSEligible1
Timestamp: 2017-02-28T07:14:40GMT
########################################
Feature: SellingLocation_numReturned1
Timestamp: 2017-02-28T07:14:42GMT
########################################
Feature: SellingLocation_numReturnedNotNull
Timestamp: 2017-02-28T07:14:43GMT
########################################
Feature: SellingLocation_numReturnedasm1
Timestamp: 2017-02-28T07:14:45GMT
########################################
Feature: SellingLocation_numTransactions
Timestamp: 2017-02-28T07:14:47GMT
########################################
Feature: Simulcast
Timestamp: 2017-02-28T07:14:49GMT
########################################
Feature: VIN_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:14:51GMT
########################################
Feature: VIN_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:14:53GMT
########################################
Feature: VIN_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:14:55GMT
########################################
Feature: VIN_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:14:56GMT
########################################
Feature: VIN_numDSEligible1
Timestamp: 2017-02-28T07:14:58GMT
########################################
Feature: VIN_numReturned1
Timestamp: 2017-02-28T07:15:00GMT
########################################
Feature: VIN_numReturnedNotNull
Timestamp: 2017-02-28T07:15:02GMT
########################################
Feature: VIN_numReturnedasm1
Timestamp: 2017-02-28T07:15:04GMT
########################################
Feature: VIN_numTransactions
Timestamp: 2017-02-28T07:15:06GMT
################################################################################
Plot traces (timeseries) for fractional quantities vs fraction of completed transactions.
########################################
Category column:    BuyerID
Transaction column: BuyerID_numTransactions
Fraction column:    BuyerID_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:15:07GMT
########################################
Category column:    BuyerID
Transaction column: BuyerID_numTransactions
Fraction column:    BuyerID_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:15:31GMT
########################################
Category column:    BuyerID
Transaction column: BuyerID_numTransactions
Fraction column:    BuyerID_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:15:55GMT
########################################
Category column:    BuyerID
Transaction column: BuyerID_numTransactions
Fraction column:    BuyerID_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:16:19GMT
########################################
Category column:    SellerID
Transaction column: SellerID_numTransactions
Fraction column:    SellerID_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:16:43GMT
########################################
Category column:    SellerID
Transaction column: SellerID_numTransactions
Fraction column:    SellerID_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:17:06GMT
########################################
Category column:    SellerID
Transaction column: SellerID_numTransactions
Fraction column:    SellerID_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:17:28GMT
########################################
Category column:    SellerID
Transaction column: SellerID_numTransactions
Fraction column:    SellerID_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:17:51GMT
########################################
Category column:    VIN
Transaction column: VIN_numTransactions
Fraction column:    VIN_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:18:13GMT
########################################
Category column:    VIN
Transaction column: VIN_numTransactions
Fraction column:    VIN_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:18:36GMT
########################################
Category column:    VIN
Transaction column: VIN_numTransactions
Fraction column:    VIN_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:19:00GMT
########################################
Category column:    VIN
Transaction column: VIN_numTransactions
Fraction column:    VIN_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:19:23GMT
########################################
Category column:    SellingLocation
Transaction column: SellingLocation_numTransactions
Fraction column:    SellingLocation_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:19:46GMT
########################################
Category column:    SellingLocation
Transaction column: SellingLocation_numTransactions
Fraction column:    SellingLocation_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:20:08GMT
########################################
Category column:    SellingLocation
Transaction column: SellingLocation_numTransactions
Fraction column:    SellingLocation_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:20:29GMT
########################################
Category column:    SellingLocation
Transaction column: SellingLocation_numTransactions
Fraction column:    SellingLocation_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:20:51GMT
########################################
Category column:    CarMake
Transaction column: CarMake_numTransactions
Fraction column:    CarMake_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:21:12GMT
########################################
Category column:    CarMake
Transaction column: CarMake_numTransactions
Fraction column:    CarMake_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:21:34GMT
########################################
Category column:    CarMake
Transaction column: CarMake_numTransactions
Fraction column:    CarMake_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:21:55GMT
########################################
Category column:    CarMake
Transaction column: CarMake_numTransactions
Fraction column:    CarMake_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:22:16GMT
########################################
Category column:    JDPowersCat
Transaction column: JDPowersCat_numTransactions
Fraction column:    JDPowersCat_fracDSEligible1DivTransactions
Timestamp: 2017-02-28T07:22:38GMT
########################################
Category column:    JDPowersCat
Transaction column: JDPowersCat_numTransactions
Fraction column:    JDPowersCat_fracReturnedNotNullDivDSEligible1
Timestamp: 2017-02-28T07:23:00GMT
########################################
Category column:    JDPowersCat
Transaction column: JDPowersCat_numTransactions
Fraction column:    JDPowersCat_fracReturned1DivReturnedNotNull
Timestamp: 2017-02-28T07:23:22GMT
########################################
Category column:    JDPowersCat
Transaction column: JDPowersCat_numTransactions
Fraction column:    JDPowersCat_fracReturnedasm1DivTransactions
Timestamp: 2017-02-28T07:23:44GMT
In [15]:
print(textwrap.dedent("""\
    Cluster map of feature correlations with heirarchical relationships.
    The deeper of the dendrogram node, the higher (anti)correlated the features are.
    The Spearman rank correlation accommodates non-linear features.
    The pair plot is a scatter matrix plot of columns vs each other."""))

# Write column labels to csv file.
path_plot_dir = os.path.join(path_data_dir, 'plot_eda_extra')
cols_plot = df.columns[np.logical_or(df.dtypes=='int64', df.dtypes=='float64')]
ds_columns = pd.Series(cols_plot, name='column')
ds_columns.to_csv(
    os.path.join(path_plot_dir, 'eda-extra_index_column_map.csv'),
    header=True, index_label='index')

# Create clustermap.
df_tmp = df[cols_plot].sample(n=1000, replace=False, random_state=0)
df_tmp.columns = [np.where(ds_columns.values == col)[0][0] for col in df_tmp.columns]
cmap = sns.clustermap(df_tmp.corr(method='spearman'), figsize=(20, 20))
plt.savefig(
    os.path.join(path_plot_dir, 'eda-extra_clustermap.png'),
    bbox_inches='tight', dpi=300)
plt.show()

# Print and write ordered column labels to csv file
with pd.option_context('display.max_rows', len(cmap.data2d.columns)):
    print(ds_columns.loc[cmap.data2d.columns])
ds_columns.loc[cmap.data2d.columns].to_csv(
    os.path.join(path_plot_dir, 'eda-extra_clustermap_index_column_map.csv'),
    header=True, index_label='index')
Cluster map of feature correlations with heirarchical relationships.
The deeper of the dendrogram node, the higher (anti)correlated the features are.
The Spearman rank correlation accommodates non-linear features.
The pair plot is a scatter matrix plot of columns vs each other.
55                                   VIN_numDSEligible1
16                                           DSEligible
56                   VIN_fracDSEligible1DivTransactions
38               BuyerID_fracDSEligible1DivTransactions
13                                          PSIEligible
2                                               CarYear
1                                             SalePrice
3                                                   MMR
47              SellerID_fracDSEligible1DivTransactions
65       SellingLocation_fracDSEligible1DivTransactions
28                                      JDPowersCat_SUV
83           JDPowersCat_fracDSEligible1DivTransactions
12                                               InLane
4                                                LIGHTG
15                                      ConditionReport
24                                JDPowersCat_LUXURYCAR
74               CarMake_fracDSEligible1DivTransactions
25                               JDPowersCat_MIDSIZECAR
10                                                  OVE
19                                  SellingLocation_lat
20                                  SellingLocation_lon
7                                                   PSI
14                                      Autocheck_score
11                                            Simulcast
32                                         SaleDate_dow
21                               JDPowersCat_COMPACTCAR
26                                   JDPowersCat_PICKUP
5                                                LIGHTY
29                                  JDPowersCat_UNKNOWN
34                                         SaleDate_day
27                                JDPowersCat_SPORTSCAR
30                                      JDPowersCat_VAN
22                                 JDPowersCat_EXCLUDED
23                              JDPowersCat_FULLSIZECAR
80              CarMake_fracReturnedasm1DivTransactions
79                              CarMake_numReturnedasm1
72                              CarMake_numTransactions
73                               CarMake_numDSEligible1
41                                 BuyerID_numReturned1
42              BuyerID_fracReturned1DivReturnedNotNull
39                           BuyerID_numReturnedNotNull
40            BuyerID_fracReturnedNotNullDivDSEligible1
78              CarMake_fracReturned1DivReturnedNotNull
87          JDPowersCat_fracReturned1DivReturnedNotNull
81                          JDPowersCat_numTransactions
82                           JDPowersCat_numDSEligible1
75                           CarMake_numReturnedNotNull
77                                 CarMake_numReturned1
76            CarMake_fracReturnedNotNullDivDSEligible1
85        JDPowersCat_fracReturnedNotNullDivDSEligible1
33                                         SaleDate_doy
35                                     SaleDate_decyear
88                          JDPowersCat_numReturnedasm1
84                       JDPowersCat_numReturnedNotNull
86                             JDPowersCat_numReturned1
69      SellingLocation_fracReturned1DivReturnedNotNull
68                         SellingLocation_numReturned1
66                   SellingLocation_numReturnedNotNull
67    SellingLocation_fracReturnedNotNullDivDSEligible1
70                      SellingLocation_numReturnedasm1
63                      SellingLocation_numTransactions
64                       SellingLocation_numDSEligible1
50                                SellerID_numReturned1
51             SellerID_fracReturned1DivReturnedNotNull
48                          SellerID_numReturnedNotNull
49           SellerID_fracReturnedNotNullDivDSEligible1
36                              BuyerID_numTransactions
37                               BuyerID_numDSEligible1
45                             SellerID_numTransactions
46                              SellerID_numDSEligible1
9                                               Salvage
18                                         Returned_asm
61                                  VIN_numReturnedasm1
62                  VIN_fracReturnedasm1DivTransactions
31                                       LIGHT_N0G1Y2R3
0                                               Mileage
6                                                LIGHTR
59                                     VIN_numReturned1
60                  VIN_fracReturned1DivReturnedNotNull
17                                             Returned
57                               VIN_numReturnedNotNull
58                VIN_fracReturnedNotNullDivDSEligible1
8                                            Arbitrated
54                                  VIN_numTransactions
89          JDPowersCat_fracReturnedasm1DivTransactions
43                              BuyerID_numReturnedasm1
44              BuyerID_fracReturnedasm1DivTransactions
71      SellingLocation_fracReturnedasm1DivTransactions
52                             SellerID_numReturnedasm1
53             SellerID_fracReturnedasm1DivTransactions
Name: column, dtype: object
In [16]:
print(textwrap.dedent("""\
    Cluster map of record correlations with heirarchical relationships."""))

# Create clustermap.
sns.clustermap(df_tmp.transpose().corr(method='spearman'), figsize=(20, 20))
plt.savefig(
    os.path.join(path_plot_dir, 'eda-extra_clustermap_records.png'),
    bbox_inches='tight', dpi=300)
plt.show()
Cluster map of record correlations with heirarchical relationships.
In [17]:
print("Pairplot of columns (ordered like clustermap):")
buyer_retrate_idx = np.where(ds_columns.values == buyer_retrate)[0][0]
idxs_top_corr = cmap.data2d[buyer_retrate_idx].abs().sort_values(ascending=False)[:6].index
tfmask_top_corr = cmap.data2d.index.isin(idxs_top_corr)
col_hue = 'buyer_retrate_gt01'
cols_plot = cmap.data2d.columns[tfmask_top_corr].append(pd.Index([col_hue]))
df_tmp[col_hue] = df_tmp[buyer_retrate_idx] > buyer_retrate_max

sns.pairplot(
    df_tmp[cols_plot], hue=col_hue, diag_kind='hist', markers=['.', 'o'],
    palette=[sns.color_palette()[1], sns.color_palette()[2]],
    plot_kws={'alpha':1.0})
plt.savefig(
    os.path.join(path_plot_dir, 'eda-extra_pairplot.png'),
    bbox_inches='tight', dpi=300)
plt.show()
Pairplot of columns (ordered like clustermap):
In [23]:
del df_tmp
gc.collect()
Out[23]:
233416

Heuristic model

In [13]:
# re-initialize matplotlib inline in case handle dropped
%matplotlib inline
demo.app_predict.predict.plot_heuristic(
    df=df,
    path_plot_dir=os.path.join(path_data_dir, 'plot_heuristic'))

Test heuristic

Note: The heuristic isn't "trained", just calculated (no iterations). TODO:

  • Bootstrap to get uncertainties.
  • Make list of bad dealers
  • Predict 'Returned': P(Returned < 0.1) then P(Dealer_retrate < 0.1)
  • Don't remove Returned == -1 since that means not DealShield approved or yes DealShield approved but not purchased.
In [15]:
date_range = pd.date_range(start=df['SaleDate'].min(), end=df['SaleDate'].max(), freq='W')
df_modl = df.loc[
    np.logical_and(
        date_range[0] <= df['SaleDate'],
        df['SaleDate'] < date_range[1])].copy()
df_orig = df.loc[
    np.logical_and(
        date_range[0] <= df['SaleDate'],
        df['SaleDate'] < date_range[1])].copy()
buyers = np.asarray([])
buyers_prohibited = dict() # key: (df_train['SaleDate'].min(), df_train['SaleDate'].max())
transactions_affected = dict() # key: (df_test['SaleDate'].min(), df_test['SaleDate'].max())
retrates_modl_chunk = dict() # key: (df_test['SaleDate'].min(), df_test['SaleDate'].max())
retrates_orig_chunk = dict() # key: (df_eval['SaleDate'].min(), df_eval['SaleDate'].max())
retrates_modl_all = dict() # key: (df_modl['SaleDate'].min(), df_modl['SaleDate'].max())
retrates_orig_all = dict() # key: (df_orig['SaleDate'].min(), df_orig['SaleDate'].max())

for idx in range(len(date_range)-2):
    print('#'*40)
    print('Timestamp:', time.strftime(r'%Y-%m-%dT%H:%M:%S%Z', time.gmtime()))
    
    # Define data sets.
    (saledate_train_min, saledate_train_max) = (date_range[idx],   date_range[idx+1])
    (saledate_test_min,  saledate_test_max)  = (date_range[idx+1], date_range[idx+2])
    (saledate_eval_min,  saledate_eval_max)  = (date_range[idx+1], date_range[idx+2])
    df_train = df_modl.loc[
        np.logical_and(
            saledate_train_min <= df['SaleDate'],
            df['SaleDate'] < saledate_train_max)].copy()
    df_test = df.loc[
        np.logical_and(
            saledate_test_min <= df['SaleDate'],
            df['SaleDate'] < saledate_test_max)].copy()
    df_eval = df.loc[
        np.logical_and(
            saledate_eval_min <= df['SaleDate'],
            df['SaleDate'] < saledate_eval_max)].copy()
    
    # Prohibit purchase of DealShield in test data:
    # If prohibiting purchase of DealShield,
    # then for prohibed buyers, set DSEligible = 0 and Returned = -1
    buyers = np.unique(
        np.append(
            buyers,
            df_train.loc[df_train[buyer_retrate] > buyer_retrate_max, 'BuyerID'].unique()))
    buyers_prohibited[(df_train['SaleDate'].min(), df_train['SaleDate'].max())] = buyers
    tfmask = np.logical_and(
        np.logical_and(
            df_test['SaleDate'] > df_train['SaleDate'].max(),
            df_test['BuyerID'].isin(buyers)),
        df_test['DSEligible'] == 1)
    transactions_affected[(df_test['SaleDate'].min(), df_test['SaleDate'].max())] = tfmask.loc[tfmask].index
    df_test.loc[tfmask, 'DSEligible'] = 0
    df_test.loc[tfmask, 'Returned'] = -1
    
    # Update calculated features.
    df_modl = demo.app_predict.predict.update_features_append(
        df_prev=df_modl, df_next=df_test, debug=False)
    df_orig = df_orig.append(df_eval)
    
    # Calculate chunk return rates.
    df_chunk = df_modl.loc[
        np.logical_and(
            saledate_test_min <= df_modl['SaleDate'],
            df_modl['SaleDate'] < saledate_test_max)]
    retrate_modl = sum(df_chunk['Returned']==1)/sum(df_chunk['Returned']!=-1)
    df_chunk = df_orig.loc[
        np.logical_and(
            saledate_eval_min <= df_orig['SaleDate'],
            df_orig['SaleDate'] < saledate_eval_max)]
    retrate_orig = sum(df_chunk['Returned']==1)/sum(df_chunk['Returned']!=-1)
    retrates_modl_chunk[(saledate_test_min, saledate_test_max)] = retrate_modl
    retrates_orig_chunk[(saledate_eval_min, saledate_eval_max)] = retrate_orig
    print('Chunk:')
    print('Evaluation time span:', df_chunk['SaleDate'].min(), df_chunk['SaleDate'].max())
    print('Model return rate:   ', retrate_modl)
    print('Original return rate:', retrate_orig)

    # Calculate overall return rates.
    retrate_modl = sum(df_modl['Returned']==1)/sum(df_modl['Returned']!=-1)
    retrate_orig = sum(df_orig['Returned']==1)/sum(df_orig['Returned']!=-1)
    retrates_modl_all[(df_modl['SaleDate'].min(), df_modl['SaleDate'].max())] = retrate_modl
    retrates_orig_all[(df_orig['SaleDate'].min(), df_orig['SaleDate'].max())] = retrate_orig
    print('Overall:')
    print('Evaluation time span:', df_orig['SaleDate'].min(), df_orig['SaleDate'].max())
    print('Model return rate:   ', retrate_modl)
    print('Original return rate:', retrate_orig)
########################################
Timestamp: 2017-02-28T19:27:11GMT
Chunk:
Evaluation time span: 2013-02-10 00:00:00 2013-02-16 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-02-16 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-02-28T19:27:12GMT
Chunk:
Evaluation time span: 2013-02-17 00:00:00 2013-02-23 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-02-23 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-02-28T19:27:12GMT
Chunk:
Evaluation time span: 2013-02-24 00:00:00 2013-03-02 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-02 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-02-28T19:27:13GMT
Chunk:
Evaluation time span: 2013-03-03 00:00:00 2013-03-09 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-09 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-02-28T19:27:14GMT
Chunk:
Evaluation time span: 2013-03-10 00:00:00 2013-03-16 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-16 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-02-28T19:27:16GMT
Chunk:
Evaluation time span: 2013-03-17 00:00:00 2013-03-23 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-23 00:00:00
Model return rate:    0.0
Original return rate: 0.0
########################################
Timestamp: 2017-02-28T19:27:17GMT
Chunk:
Evaluation time span: 2013-03-25 00:00:00 2013-03-30 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-30 00:00:00
Model return rate:    0.0
Original return rate: 0.0
########################################
Timestamp: 2017-02-28T19:27:18GMT
Chunk:
Evaluation time span: 2013-03-31 00:00:00 2013-04-06 00:00:00
Model return rate:    0.0263157894737
Original return rate: 0.0263157894737
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-06 00:00:00
Model return rate:    0.02
Original return rate: 0.02
########################################
Timestamp: 2017-02-28T19:27:20GMT
Chunk:
Evaluation time span: 2013-04-07 00:00:00 2013-04-13 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-13 00:00:00
Model return rate:    0.0103092783505
Original return rate: 0.0103092783505
########################################
Timestamp: 2017-02-28T19:27:22GMT
Chunk:
Evaluation time span: 2013-04-14 00:00:00 2013-04-19 00:00:00
Model return rate:    0.0545454545455
Original return rate: 0.0535714285714
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-19 00:00:00
Model return rate:    0.0263157894737
Original return rate: 0.0261437908497
########################################
Timestamp: 2017-02-28T19:27:23GMT
Chunk:
Evaluation time span: 2013-04-21 00:00:00 2013-04-27 00:00:00
Model return rate:    0.0588235294118
Original return rate: 0.0576923076923
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-27 00:00:00
Model return rate:    0.0344827586207
Original return rate: 0.0341463414634
########################################
Timestamp: 2017-02-28T19:27:25GMT
Chunk:
Evaluation time span: 2013-04-28 00:00:00 2013-05-04 00:00:00
Model return rate:    0.0816326530612
Original return rate: 0.0816326530612
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-04 00:00:00
Model return rate:    0.0436507936508
Original return rate: 0.0433070866142
########################################
Timestamp: 2017-02-28T19:27:28GMT
Chunk:
Evaluation time span: 2013-05-05 00:00:00 2013-05-11 00:00:00
Model return rate:    0.281481481481
Original return rate: 0.274647887324
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-11 00:00:00
Model return rate:    0.12661498708
Original return rate: 0.126262626263
########################################
Timestamp: 2017-02-28T19:27:29GMT
Chunk:
Evaluation time span: 2013-05-12 00:00:00 2013-05-18 00:00:00
Model return rate:    0.2625
Original return rate: 0.269662921348
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-18 00:00:00
Model return rate:    0.149892933619
Original return rate: 0.152577319588
########################################
Timestamp: 2017-02-28T19:27:31GMT
Chunk:
Evaluation time span: 2013-05-19 00:00:00 2013-05-25 00:00:00
Model return rate:    0.168604651163
Original return rate: 0.185567010309
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-25 00:00:00
Model return rate:    0.154929577465
Original return rate: 0.162002945508
########################################
Timestamp: 2017-02-28T19:27:34GMT
Chunk:
Evaluation time span: 2013-05-26 00:00:00 2013-05-31 00:00:00
Model return rate:    0.263157894737
Original return rate: 0.298780487805
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-31 00:00:00
Model return rate:    0.173575129534
Original return rate: 0.188612099644
########################################
Timestamp: 2017-02-28T19:27:36GMT
Chunk:
Evaluation time span: 2013-06-02 00:00:00 2013-06-08 00:00:00
Model return rate:    0.335766423358
Original return rate: 0.30243902439
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-08 00:00:00
Model return rate:    0.19801980198
Original return rate: 0.210877862595
########################################
Timestamp: 2017-02-28T19:27:38GMT
Chunk:
Evaluation time span: 2013-06-10 00:00:00 2013-06-15 00:00:00
Model return rate:    0.286516853933
Original return rate: 0.278195488722
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-15 00:00:00
Model return rate:    0.21251149954
Original return rate: 0.224505327245
########################################
Timestamp: 2017-02-28T19:27:41GMT
Chunk:
Evaluation time span: 2013-06-16 00:00:00 2013-06-22 00:00:00
Model return rate:    0.106194690265
Original return rate: 0.166153846154
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-22 00:00:00
Model return rate:    0.194211728865
Original return rate: 0.21293471629
########################################
Timestamp: 2017-02-28T19:27:44GMT
Chunk:
Evaluation time span: 2013-06-23 00:00:00 2013-06-29 00:00:00
Model return rate:    0.187311178248
Original return rate: 0.203187250996
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-29 00:00:00
Model return rate:    0.192822384428
Original return rate: 0.210649229332
########################################
Timestamp: 2017-02-28T19:27:47GMT
Chunk:
Evaluation time span: 2013-06-30 00:00:00 2013-07-06 00:00:00
Model return rate:    0.166666666667
Original return rate: 0.220708446866
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-06 00:00:00
Model return rate:    0.189345991561
Original return rate: 0.212121212121
########################################
Timestamp: 2017-02-28T19:27:50GMT
Chunk:
Evaluation time span: 2013-07-07 00:00:00 2013-07-13 00:00:00
Model return rate:    0.185915492958
Original return rate: 0.2
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-13 00:00:00
Model return rate:    0.188804975566
Original return rate: 0.20987654321
########################################
Timestamp: 2017-02-28T19:27:53GMT
Chunk:
Evaluation time span: 2013-07-14 00:00:00 2013-07-20 00:00:00
Model return rate:    0.189102564103
Original return rate: 0.244698205546
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-20 00:00:00
Model return rate:    0.188841201717
Original return rate: 0.215659712815
########################################
Timestamp: 2017-02-28T19:27:57GMT
Chunk:
Evaluation time span: 2013-07-21 00:00:00 2013-07-27 00:00:00
Model return rate:    0.22027972028
Original return rate: 0.269097222222
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-27 00:00:00
Model return rate:    0.191997191997
Original return rate: 0.22287321303
########################################
Timestamp: 2017-02-28T19:28:00GMT
Chunk:
Evaluation time span: 2013-07-28 00:00:00 2013-08-03 00:00:00
Model return rate:    0.219594594595
Original return rate: 0.256944444444
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-03 00:00:00
Model return rate:    0.194594594595
Original return rate: 0.226925459426
########################################
Timestamp: 2017-02-28T19:28:04GMT
Chunk:
Evaluation time span: 2013-08-04 00:00:00 2013-08-10 00:00:00
Model return rate:    0.232067510549
Original return rate: 0.252727272727
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-10 00:00:00
Model return rate:    0.197220579539
Original return rate: 0.229556832932
########################################
Timestamp: 2017-02-28T19:28:08GMT
Chunk:
Evaluation time span: 2013-08-11 00:00:00 2013-08-17 00:00:00
Model return rate:    0.164086687307
Original return rate: 0.230529595016
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-17 00:00:00
Model return rate:    0.194331983806
Original return rate: 0.22966031483
########################################
Timestamp: 2017-02-28T19:28:11GMT
Chunk:
Evaluation time span: 2013-08-18 00:00:00 2013-08-24 00:00:00
Model return rate:    0.12962962963
Original return rate: 0.184568835098
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-24 00:00:00
Model return rate:    0.189128816083
Original return rate: 0.225209080048
########################################
Timestamp: 2017-02-28T19:28:15GMT
Chunk:
Evaluation time span: 2013-08-25 00:00:00 2013-08-31 00:00:00
Model return rate:    0.152941176471
Original return rate: 0.225543478261
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-31 00:00:00
Model return rate:    0.186312657359
Original return rate: 0.22524219591
########################################
Timestamp: 2017-02-28T19:28:19GMT
Chunk:
Evaluation time span: 2013-09-01 00:00:00 2013-09-07 00:00:00
Model return rate:    0.146179401993
Original return rate: 0.193602693603
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-07 00:00:00
Model return rate:    0.183725910064
Original return rate: 0.222900573137
########################################
Timestamp: 2017-02-28T19:28:23GMT
Chunk:
Evaluation time span: 2013-09-08 00:00:00 2013-09-14 00:00:00
Model return rate:    0.164864864865
Original return rate: 0.221621621622
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-14 00:00:00
Model return rate:    0.182341269841
Original return rate: 0.222792607803
########################################
Timestamp: 2017-02-28T19:28:29GMT
Chunk:
Evaluation time span: 2013-09-15 00:00:00 2013-09-21 00:00:00
Model return rate:    0.165413533835
Original return rate: 0.235378031384
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-21 00:00:00
Model return rate:    0.18149264983
Original return rate: 0.223724516742
########################################
Timestamp: 2017-02-28T19:28:36GMT
Chunk:
Evaluation time span: 2013-09-22 00:00:00 2013-09-28 00:00:00
Model return rate:    0.107023411371
Original return rate: 0.198550724638
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-28 00:00:00
Model return rate:    0.177520071365
Original return rate: 0.222014374323
########################################
Timestamp: 2017-02-28T19:28:41GMT
Chunk:
Evaluation time span: 2013-09-29 00:00:00 2013-10-05 00:00:00
Model return rate:    0.134375
Original return rate: 0.236694677871
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-05 00:00:00
Model return rate:    0.175189873418
Original return rate: 0.222978566829
########################################
Timestamp: 2017-02-28T19:28:47GMT
Chunk:
Evaluation time span: 2013-10-06 00:00:00 2013-10-12 00:00:00
Model return rate:    0.183946488294
Original return rate: 0.260740740741
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-12 00:00:00
Model return rate:    0.175610539846
Original return rate: 0.225186211675
########################################
Timestamp: 2017-02-28T19:28:55GMT
Chunk:
Evaluation time span: 2013-10-13 00:00:00 2013-10-19 00:00:00
Model return rate:    0.164658634538
Original return rate: 0.280575539568
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-19 00:00:00
Model return rate:    0.175189247644
Original return rate: 0.228331018708
########################################
Timestamp: 2017-02-28T19:29:01GMT
Chunk:
Evaluation time span: 2013-10-20 00:00:00 2013-10-26 00:00:00
Model return rate:    0.174418604651
Original return rate: 0.238993710692
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-26 00:00:00
Model return rate:    0.175150359396
Original return rate: 0.228981282602
########################################
Timestamp: 2017-02-28T19:29:07GMT
Chunk:
Evaluation time span: 2013-10-27 00:00:00 2013-11-02 00:00:00
Model return rate:    0.171662125341
Original return rate: 0.236220472441
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-02 00:00:00
Model return rate:    0.174972160356
Original return rate: 0.22938106972
########################################
Timestamp: 2017-02-28T19:29:14GMT
Chunk:
Evaluation time span: 2013-11-03 00:00:00 2013-11-09 00:00:00
Model return rate:    0.125944584383
Original return rate: 0.179761904762
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-09 00:00:00
Model return rate:    0.17240469595
Original return rate: 0.226533679464
########################################
Timestamp: 2017-02-28T19:29:21GMT
Chunk:
Evaluation time span: 2013-11-10 00:00:00 2013-11-16 00:00:00
Model return rate:    0.114525139665
Original return rate: 0.178717598909
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-16 00:00:00
Model return rate:    0.169794684469
Original return rate: 0.224253464316
########################################
Timestamp: 2017-02-28T19:29:28GMT
Chunk:
Evaluation time span: 2013-11-17 00:00:00 2013-11-23 00:00:00
Model return rate:    0.124324324324
Original return rate: 0.183979974969
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-23 00:00:00
Model return rate:    0.167769888073
Original return rate: 0.222263450835
########################################
Timestamp: 2017-02-28T19:29:36GMT
Chunk:
Evaluation time span: 2013-11-24 00:00:00 2013-11-30 00:00:00
Model return rate:    0.0877192982456
Original return rate: 0.181818181818
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-30 00:00:00
Model return rate:    0.166155660377
Original return rate: 0.221296631655
########################################
Timestamp: 2017-02-28T19:29:43GMT
Chunk:
Evaluation time span: 2013-12-01 00:00:00 2013-12-07 00:00:00
Model return rate:    0.105849582173
Original return rate: 0.175675675676
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-07 00:00:00
Model return rate:    0.163706301618
Original return rate: 0.219345891598
########################################
Timestamp: 2017-02-28T19:29:51GMT
Chunk:
Evaluation time span: 2013-12-09 00:00:00 2013-12-14 00:00:00
Model return rate:    0.0692124105012
Original return rate: 0.170186335404
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-14 00:00:00
Model return rate:    0.159429682437
Original return rate: 0.217160841478
########################################
Timestamp: 2017-02-28T19:29:59GMT
Chunk:
Evaluation time span: 2013-12-15 00:00:00 2013-12-20 00:00:00
Model return rate:    0.061135371179
Original return rate: 0.144144144144
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-20 00:00:00
Model return rate:    0.154796212433
Original return rate: 0.212944175641
########################################
Timestamp: 2017-02-28T19:30:07GMT
Chunk:
Evaluation time span: 2013-12-22 00:00:00 2013-12-28 00:00:00
Model return rate:    0.0645161290323
Original return rate: 0.142857142857
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-28 00:00:00
Model return rate:    0.153940258946
Original return rate: 0.212037179685
In [21]:
# Overall return rate is Returned==1 / (Returned not null)
print("Plot performance of heuristic")
path_plot_dir = os.path.join(path_data_dir, 'plot_performance_heuristic')

# Plot overall return rate.
(xvals_orig, yvals_orig) = zip(*[(key[1], val) for (key, val) in sorted(retrates_orig_all.items())])
(xvals_modl, yvals_modl) = zip(*[(key[1], val) for (key, val) in sorted(retrates_modl_all.items())])
(xvals_diff, yvals_diff) = (xvals_orig, np.subtract(yvals_modl, yvals_orig))
plt.plot(
    xvals_orig, yvals_orig, marker='.', linestyle='-',
    color=sns.color_palette()[0], label='original return rate')
plt.plot(
    xvals_modl, yvals_modl, marker='.', linestyle='-',
    color=sns.color_palette()[1], label='model return rate')
plt.plot(
    xvals_diff, yvals_diff, marker='.', linestyle='--',
    color=sns.color_palette()[3], label='diff model-original')
plt.title("Return rates vs SaleDate")
plt.xlabel("SaleDate")
plt.ylabel("Overall return rate")
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig(
    os.path.join(path_plot_dir, 'perf-heur_returnrate_vs_saledate.png'),
    dpi=300)
plt.show()

# Plot return rate by buyer.
# Note: to get buyerid with col max: df_orig.loc[df_orig[col].argmax(), 'BuyerID']
buyerids = [
    '272356', # buyer with max BuyerID_numReturnedNotNull (max accepted DealShield)
    '328701', # buyer with max BuyerID_numReturned1 (max returns)
    '179863', # buyer with max BuyerID_fracReturned1DivReturnedNotNull (max return rate) and frequenly prohibited (36 weeks)
    '46857', # buyers frequently prohibited (38 weeks)
    '62851', # buyers frequently prohibited (36 weeks)
    '16640', # buyers frequently prohibited (36 weeks)
    '61773', # buyers frequently prohibited (35 weeks)
    '20718', # buyers frequently prohibited (35 weeks)
    '18584', # buyers frequently prohibited (34 weeks)
    '248009'] # buyers frequently prohibited (34 weeks)

for buyerid in buyerids:
    print('#'*40)
    print('BuyerID:', buyerid)

    fig = plt.figure()
    ax0 = fig.add_subplot(111)
    ax0.set_title(textwrap.dedent("""\
        Returns and return rates vs num transactions
        for BuyerID={buyerid}""".format(buyerid=buyerid)))
    ax0.set_xlabel('BuyerID_numTransactions')

    ax0.set_ylabel('BuyerID_numReturned1')
    lns0 = list()
    xvals_orig = df_orig.loc[df_orig['BuyerID']==buyerid, 'BuyerID_numTransactions'].values
    yvals0_orig = df_orig.loc[df_orig['BuyerID']==buyerid, 'BuyerID_numReturned1'].values
    lns0 += ax0.plot(
        xvals_orig, yvals0_orig, marker='.', linestyle='-', 
        color=sns.color_palette(palette='dark')[0], label='original BuyerID_numReturned1')
    xvals_modl = df_modl.loc[df_modl['BuyerID']==buyerid, 'BuyerID_numTransactions'].values
    yvals0_modl = df_modl.loc[df_modl['BuyerID']==buyerid, 'BuyerID_numReturned1'].values
    lns0 += ax0.plot(
        xvals_modl, yvals0_modl, marker='.', linestyle='-', 
        color=sns.color_palette(palette=None)[0], label='model BuyerID_numReturned1')
    ylim0 = (
        min(min(yvals0_orig), min(yvals0_modl)),
        max(max(max(yvals0_orig), max(yvals0_modl)), 1))
    ax0.set_ylim(ylim0)
    nticks = 6
    ax0.set_yticks(np.linspace(start=ylim0[0], stop=ylim0[1], num=nticks, endpoint=True))
    ax0.legend(lns0, [ln.get_label() for ln in lns0], loc='upper left')

    ax1 = ax0.twinx()
    ax1.set_ylabel(buyer_retrate)
    lns1 = list()
    yvals1_orig = df_orig.loc[df_orig['BuyerID']==buyerid, buyer_retrate].values
    lns1 += ax1.plot(
        xvals_orig, yvals1_orig, marker='.', linestyle='-', 
        color=sns.color_palette(palette='dark')[1], label='original '+buyer_retrate)
    yvals1_modl = df_modl.loc[df_modl['BuyerID']==buyerid, buyer_retrate].values
    lns1 += ax1.plot(
        xvals_modl, yvals1_modl, marker='.', linestyle='-', 
        color=sns.color_palette(palette=None)[1], label='model '+buyer_retrate)
    ylim1 = (0, 1)
    ax1.set_ylim(ylim1)
    ax1.set_yticks(np.linspace(start=ylim1[0], stop=ylim1[1], num=nticks, endpoint=True))
    ax1.legend(lns1, [ln.get_label() for ln in lns1], loc='lower right')
    plt.tight_layout()
    plt.savefig(
        os.path.join(path_plot_dir, 'perf-heur_returnrate_vs_transactions_for_'+buyerid+'.png'),
        dpi=300)
    plt.show(fig)
Plot performance of heuristic
########################################
BuyerID: 272356
########################################
BuyerID: 328701
########################################
BuyerID: 179863
########################################
BuyerID: 46857
########################################
BuyerID: 62851
########################################
BuyerID: 16640
########################################
BuyerID: 61773
/opt/conda/lib/python3.5/site-packages/matplotlib/axes/_base.py:3045: UserWarning: Attempting to set identical bottom==top results
in singular transformations; automatically expanding.
bottom=1, top=1
  'bottom=%s, top=%s') % (bottom, top))
########################################
BuyerID: 20718
########################################
BuyerID: 18584
########################################
BuyerID: 248009
In [52]:
del df_chunk, df_eval, df_modl, df_orig, df_test, df_train
gc.collect()

Pipeline model

TODO:

  • Try with data subset DSEligible==1, then use LDA instead of PCA to create the features to separate Returned=1/0
In [12]:
# Define target and features
target = 'Returned'
features = set(df.columns[np.logical_or(df.dtypes=='int64', df.dtypes=='float64')])
features.difference_update([target])
features = sorted(features)
features
Out[12]:
['Arbitrated',
 'Autocheck_score',
 'BuyerID_fracDSEligible1DivTransactions',
 'BuyerID_fracReturned1DivReturnedNotNull',
 'BuyerID_fracReturnedNotNullDivDSEligible1',
 'BuyerID_fracReturnedasm1DivTransactions',
 'BuyerID_numDSEligible1',
 'BuyerID_numReturned1',
 'BuyerID_numReturnedNotNull',
 'BuyerID_numReturnedasm1',
 'BuyerID_numTransactions',
 'CarMake_fracDSEligible1DivTransactions',
 'CarMake_fracReturned1DivReturnedNotNull',
 'CarMake_fracReturnedNotNullDivDSEligible1',
 'CarMake_fracReturnedasm1DivTransactions',
 'CarMake_numDSEligible1',
 'CarMake_numReturned1',
 'CarMake_numReturnedNotNull',
 'CarMake_numReturnedasm1',
 'CarMake_numTransactions',
 'CarYear',
 'ConditionReport',
 'DSEligible',
 'InLane',
 'JDPowersCat_COMPACTCAR',
 'JDPowersCat_EXCLUDED',
 'JDPowersCat_FULLSIZECAR',
 'JDPowersCat_LUXURYCAR',
 'JDPowersCat_MIDSIZECAR',
 'JDPowersCat_PICKUP',
 'JDPowersCat_SPORTSCAR',
 'JDPowersCat_SUV',
 'JDPowersCat_UNKNOWN',
 'JDPowersCat_VAN',
 'JDPowersCat_fracDSEligible1DivTransactions',
 'JDPowersCat_fracReturned1DivReturnedNotNull',
 'JDPowersCat_fracReturnedNotNullDivDSEligible1',
 'JDPowersCat_fracReturnedasm1DivTransactions',
 'JDPowersCat_numDSEligible1',
 'JDPowersCat_numReturned1',
 'JDPowersCat_numReturnedNotNull',
 'JDPowersCat_numReturnedasm1',
 'JDPowersCat_numTransactions',
 'LIGHTG',
 'LIGHTR',
 'LIGHTY',
 'LIGHT_N0G1Y2R3',
 'MMR',
 'Mileage',
 'OVE',
 'PSI',
 'PSIEligible',
 'Returned_asm',
 'SaleDate_day',
 'SaleDate_decyear',
 'SaleDate_dow',
 'SaleDate_doy',
 'SalePrice',
 'Salvage',
 'SellerID_fracDSEligible1DivTransactions',
 'SellerID_fracReturned1DivReturnedNotNull',
 'SellerID_fracReturnedNotNullDivDSEligible1',
 'SellerID_fracReturnedasm1DivTransactions',
 'SellerID_numDSEligible1',
 'SellerID_numReturned1',
 'SellerID_numReturnedNotNull',
 'SellerID_numReturnedasm1',
 'SellerID_numTransactions',
 'SellingLocation_fracDSEligible1DivTransactions',
 'SellingLocation_fracReturned1DivReturnedNotNull',
 'SellingLocation_fracReturnedNotNullDivDSEligible1',
 'SellingLocation_fracReturnedasm1DivTransactions',
 'SellingLocation_lat',
 'SellingLocation_lon',
 'SellingLocation_numDSEligible1',
 'SellingLocation_numReturned1',
 'SellingLocation_numReturnedNotNull',
 'SellingLocation_numReturnedasm1',
 'SellingLocation_numTransactions',
 'Simulcast',
 'VIN_fracDSEligible1DivTransactions',
 'VIN_fracReturned1DivReturnedNotNull',
 'VIN_fracReturnedNotNullDivDSEligible1',
 'VIN_fracReturnedasm1DivTransactions',
 'VIN_numDSEligible1',
 'VIN_numReturned1',
 'VIN_numReturnedNotNull',
 'VIN_numReturnedasm1',
 'VIN_numTransactions']
In [13]:
print(textwrap.dedent("""\
    `Container`: Create an empty container class and
    dynamically allocate attributes to hold variables for specific steps
    of the pipeline. """))
Container = demo.utils.utils.Container
step = Container()

print(textwrap.dedent("""\
    `step.s0.[df,ds]_[features,target]`: Save initial state of features, target."""))
step.s0 = Container()
step.s0.dfs = Container()
step.s0.dfs.df_features = df[features].copy()
step.s0.dfs.ds_target = df[target].copy()

# TODO: REDO after this point with step.sN.dfs.[df_features,ds_target]
# rather than redefining [df_features,ds_target]
df_features = step.s0.dfs.df_features
ds_target = step.s0.dfs.ds_target
`Container`: Create an empty container class and
dynamically allocate attributes to hold variables for specific steps
of the pipeline. 
`step.s0.[df,ds]_[features,target]`: Save initial state of features, target.
In [14]:
print(textwrap.dedent("""\
    `transformer_scaler`, `transformer_pca`: Scale data
    then make groups of similar records with k-means clustering,
    both with and without PCA. Use the silhouette score to determine
    the number of clusters.
    """))
time_start = time.perf_counter()

# Scale data prior to comparing clusters with/without PCA. 
# Note: Using sklearn.preprocessing.RobustScaler with
#     sklearn.decomposition.IncrementalPCA(whiten=False)
#     is often the most stable (slowly varying scores)
#     with highest scores. Centroid agreement can still be
#     off due to outliers.
transformer_scaler = sk_pre.RobustScaler()
features_scaled = transformer_scaler.fit_transform(X=df_features)
transformer_pca = sk_dc.IncrementalPCA(whiten=False)
features_scaled_pca = transformer_pca.fit_transform(X=features_scaled)

print("`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.")
path_data = path_data_dir
path_cols = os.path.join(path_data, 'columns.pkl')
with open(path_cols, mode='wb') as fobj:
    pickle.dump(obj=df_features.columns, file=fobj)
path_tform_scl = os.path.join(path_data, 'transformer_scaler.pkl')
with open(path_tform_scl, mode='wb') as fobj:
    pickle.dump(obj=transformer_scaler, file=fobj)
path_tform_pca = os.path.join(path_data, 'transformer_pca.pkl')
with open(path_tform_pca, mode='wb') as fobj:
    pickle.dump(obj=transformer_pca, file=fobj)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    print("Plot scores for scaled features:")
    demo.utils.utils.calc_silhouette_scores(
        df_features=features_scaled, n_clusters_min=2, n_clusters_max=10,
        size_sub=None, n_scores=10, show_progress=True, show_plot=True)

    print("Plot scores for scaled PCA features:")
    demo.utils.utils.calc_silhouette_scores(
        df_features=features_scaled_pca, n_clusters_min=2, n_clusters_max=10,
        size_sub=None, n_scores=10, show_progress=True, show_plot=True)

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
`transformer_scaler`, `transformer_pca`: Scale data
then make groups of similar records with k-means clustering,
both with and without PCA. Use the silhouette score to determine
the number of clusters.

`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
Plot scores for scaled features:
Progress: 11% 22% 33% 44% 56% 67% 78% 89% 100% 

Plot scores for scaled PCA features:
Progress: 11% 22% 33% 44% 56% 67% 78% 89% 100% 

Time elapsed (sec) = 47.4
In [15]:
print(textwrap.dedent("""\
    `transformer_kmeans`, `transformer_kmeans_pca`:
    Fit k-means to the data with/without PCA and
    compare the centroids for the clusters."""))

# TODO: Fix plot. Assign clusters IDs in a deterministic way so that
#   cluster 0 raw matches cluster 0 transformed.
time_start = time.perf_counter()

n_clusters = 2 # from silhouette scores

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Cluster scaled features with/without PCA using minibatch k-means
    transformer_kmeans = sk_cl.MiniBatchKMeans(n_clusters=n_clusters)
    transformer_kmeans.fit(X=features_scaled)
    transformer_kmeans_pca = sk_cl.MiniBatchKMeans(n_clusters=n_clusters)
    transformer_kmeans_pca.fit(X=features_scaled_pca)

print("`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.")
path_tform_km = os.path.join(path_data, 'transformer_kmeans.pkl')
with open(path_tform_km, mode='wb') as fobj:
    pickle.dump(obj=transformer_kmeans, file=fobj)
path_tform_km_pca = os.path.join(path_data, 'transformer_kmeans_pca.pkl')
with open(path_tform_km_pca, mode='wb') as fobj:
    pickle.dump(obj=transformer_kmeans_pca, file=fobj)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Plot clusters in scaled feature space.
    centroids = transformer_kmeans.cluster_centers_
    transformed_centroids = transformer_pca.inverse_transform(transformer_kmeans_pca.cluster_centers_)
    (col_1, col_0) = np.argsort(np.var(features_scaled, axis=0))[-2:]
    (name_1, name_0) = (df_features.columns.values[col_1], df_features.columns.values[col_0])
    plt.title("Data and centroids within scaled feature space")
    tfmask_gt01 = df_features[buyer_retrate] > buyer_retrate_max
    plt.plot(features_scaled[tfmask_gt01, col_0], features_scaled[tfmask_gt01, col_1],
             marker='o', linestyle='', color=sns.color_palette()[2], alpha=0.5,
             label='data, buyer_retrate_gt01')
    tfmask_lt01 = np.logical_not(tfmask_gt01)
    plt.plot(features_scaled[tfmask_lt01, col_0], features_scaled[tfmask_lt01, col_1],
             marker='.', linestyle='', color=sns.color_palette()[1], alpha=0.5,
             label='data, buyer_retrate_lt01')
    plt.plot(centroids[:, col_0], centroids[:, col_1],
             marker='+', linestyle='', markeredgewidth=2, markersize=12,
             color=sns.color_palette()[0], label='centroids')
    for (idx, centroid) in enumerate(centroids):
        plt.annotate(
            str(idx), xy=(centroid[col_0], centroid[col_1]),
            xycoords='data', xytext=(0, 0), textcoords='offset points', color='black',
            fontsize=18, rotation=0)
    plt.plot(transformed_centroids[:, col_0], transformed_centroids[:, col_1],
             marker='x', linestyle='', markeredgewidth=2, markersize=10,
             color=sns.color_palette()[1], label='transformed centroids')
    for (idx, transformed_centroid) in enumerate(transformed_centroids):
        plt.annotate(
            str(idx), xy=(transformed_centroid[col_0], transformed_centroid[col_1]),
            xycoords='data', xytext=(0, 0), textcoords='offset points', color='black',
            fontsize=18, rotation=0)
    plt.xlabel("Scaled '{name}', highest variance".format(name=name_0))
    plt.ylabel("Scaled '{name}', next highest variance".format(name=name_1))
    plt.legend(loc='upper left')
    plt.show()

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Plot clusters in scaled feature PCA space.
    transformed_centroids = transformer_pca.transform(transformer_kmeans.cluster_centers_)
    centroids = transformer_kmeans_pca.cluster_centers_
    plt.title("Data and centroids within scaled feature PCA space")
    plt.plot(features_scaled_pca[tfmask_gt01, 0], features_scaled_pca[tfmask_gt01, 1],
             marker='o', linestyle='', color=sns.color_palette()[2], alpha=0.5,
             label='transformed data, buyer_retrate_gt01')
    plt.plot(features_scaled_pca[tfmask_lt01, 0], features_scaled_pca[tfmask_lt01, 1],
             marker='.', linestyle='', color=sns.color_palette()[1], alpha=0.5,
             label='transformed data, buyer_retrate_lt01')
    plt.plot(transformed_centroids[:, 0], transformed_centroids[:, 1],
             marker='+', linestyle='', markeredgewidth=2, markersize=12,
             color=sns.color_palette()[0], label='transformed centroids')
    for (idx, transformed_centroid) in enumerate(transformed_centroids):
        plt.annotate(
            str(idx), xy=(transformed_centroid[0], transformed_centroid[1]),
            xycoords='data', xytext=(0, 0), textcoords='offset points', color='black',
            fontsize=18, rotation=0)
    plt.plot(centroids[:, 0], centroids[:, 1],
             marker='x', linestyle='', markeredgewidth=2, markersize=10,
             color=sns.color_palette()[1], label='centroids')
    for (idx, centroid) in enumerate(centroids):
        plt.annotate(
            str(idx), xy=(centroid[0], centroid[1]),
            xycoords='data', xytext=(0, 0), textcoords='offset points', color='black',
            fontsize=18, rotation=0)
    plt.xlabel('Principal component 0')
    plt.ylabel('Principal component 1')
    plt.legend(loc='upper left')
    plt.show()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
`transformer_kmeans`, `transformer_kmeans_pca`:
Fit k-means to the data with/without PCA and
compare the centroids for the clusters.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
Time elapsed (sec) = 3.0
In [16]:
print(textwrap.dedent("""\
    `df_features2`: Combine `df_features` with
    cluster labels, cluster distances, PCA components, PCA cluster labels,
    and PCA cluster distances into `df_features`."""))
time_start = time.perf_counter()

# Cluster labels and distances in feature space.
ds_clusters = pd.Series(
    transformer_kmeans.predict(X=features_scaled),
    index=df_features.index, name='cluster')
n_digits = len(str(len(transformer_kmeans.cluster_centers_)))
columns = [
    'cluster_{num}_dist'.format(num=str(num).rjust(n_digits, '0'))
    for num in range(len(transformer_kmeans.cluster_centers_))]
df_cluster_dists = pd.DataFrame(
    transformer_kmeans.transform(X=features_scaled),
    index=df_features.index, columns=columns)
if not np.all(ds_clusters.values == np.argmin(df_cluster_dists.values, axis=1)):
    raise AssertionError(
        ("Program error. Not all cluster labels match cluster label\n" +
         "with minimum distance to record.\n" +
         "Required: np.all(ds_clusters.values == np.argmin(df_cluster_dists.values, axis=1))"))

# PCA features.
n_digits = len(str(transformer_pca.n_components_))
columns = [
    'pca_comp_{num}'.format(num=str(num).rjust(n_digits, '0'))
    for num in range(transformer_pca.n_components_)]
df_features_pca = pd.DataFrame(
    features_scaled_pca, index=df_features.index, columns=columns)

# Cluster labels and distances in PCA feature space.
ds_clusters_pca = pd.Series(
    transformer_kmeans_pca.predict(X=features_scaled_pca),
    index=df_features.index, name='pca_cluster')
n_digits = len(str(len(transformer_kmeans_pca.cluster_centers_)))
columns = [
    'pca_cluster_{num}_dist'.format(num=str(num).rjust(n_digits, '0'))
    for num in range(len(transformer_kmeans_pca.cluster_centers_))]
df_cluster_dists_pca = pd.DataFrame(
    transformer_kmeans_pca.transform(X=features_scaled_pca),
    index=df_features.index, columns=columns)
if not np.all(ds_clusters_pca.values == np.argmin(df_cluster_dists_pca.values, axis=1)):
    raise AssertionError(
        ("Program error. Not all PCA cluster labels match PCA cluster label\n" +
         "with minimum distance to record.\n" +
         "Required: np.all(ds_clusters_pca.values == np.argmin(df_cluster_dists_pca.values, axis=1))"))

# Combine with original `df_features` into new `df_features2`.
df_features2 = pd.concat(
    [df_features, ds_clusters, df_cluster_dists,
     df_features_pca, ds_clusters_pca, df_cluster_dists_pca],
    axis=1, copy=True)

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
`df_features2`: Combine `df_features` with
cluster labels, cluster distances, PCA components, PCA cluster labels,
and PCA cluster distances into `df_features`.
Time elapsed (sec) = 1.3
In [17]:
print(textwrap.dedent("""\
    `df_importances` , `important_features`, `df_features3`:
    `df_features3` is a view into (not a copy) of `df_features2` with only
    `important_features`. Feature importance is the normalized reduction
    in the loss score. A feature is selected as 'important' if its average
    importance is greater than the average importance of the random feature."""))
time_start = time.perf_counter()

# Calculate feature importances.
# Note:
# * `n_estimators` impact the feature importances but only have a small
#     effect on the relative importances.
# * `n_estimators` impact the scores but only have a small effect on the relative scores.
# * Use replace=False for maximum data variety.
# TODO: Use a significance test for feature importance.
estimator = sk_ens.ExtraTreesRegressor(n_estimators=10, n_jobs=-1)
df_importances = demo.utils.utils.calc_feature_importances(
    estimator=estimator, df_features=df_features2, ds_target=ds_target,
    replace=False, show_progress=True, show_plot=True)
important_features = df_importances.columns[
    df_importances.mean() > df_importances['random'].mean()]
important_features = list(
    df_importances[important_features].mean().sort_values(ascending=False).index)
df_features3 = df_features2[important_features]
print("`important_features` =")
print(important_features)
print()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
`df_importances` , `important_features`, `df_features3`:
`df_features3` is a view into (not a copy) of `df_features2` with only
`important_features`. Feature importance is the normalized reduction
in the loss score. A feature is selected as 'important' if its average
importance is greater than the average importance of the random feature.
Progress: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 

`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_78', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_72', 'pca_comp_76', 'pca_comp_75', 'pca_comp_73', 'DSEligible', 'VIN_fracDSEligible1DivTransactions', 'VIN_numReturnedasm1', 'VIN_numReturned1', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_03', 'pca_comp_57', 'pca_comp_01', 'pca_comp_31', 'pca_comp_49', 'pca_comp_50', 'BuyerID_numReturnedNotNull', 'pca_comp_12', 'pca_comp_02', 'VIN_numTransactions', 'pca_comp_68', 'BuyerID_fracDSEligible1DivTransactions', 'SellerID_fracReturnedNotNullDivDSEligible1', 'Mileage', 'pca_comp_54', 'pca_cluster', 'VIN_numDSEligible1', 'BuyerID_numReturned1', 'pca_comp_06', 'pca_comp_38', 'pca_comp_47', 'pca_comp_09', 'pca_comp_28', 'pca_comp_04', 'pca_comp_25', 'pca_comp_61', 'SalePrice', 'BuyerID_numTransactions', 'SellingLocation_numReturnedasm1', 'SellingLocation_numReturnedNotNull', 'CarMake_fracReturnedasm1DivTransactions', 'BuyerID_fracReturned1DivReturnedNotNull', 'SellerID_numDSEligible1', 'cluster_0_dist', 'pca_comp_41', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_48', 'JDPowersCat_numReturned1', 'CarMake_numReturned1', 'pca_comp_08', 'pca_comp_66', 'pca_comp_43', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_37']

Time elapsed (sec) = 9.9
In [18]:
print("`df_features`: Most significant projections of PCA component 78:")
print(sorted(list(zip(df_features, transformer_pca.components_[78])), key=lambda tup: tup[1])[:3])
print('...')
print(sorted(list(zip(df_features, transformer_pca.components_[78])), key=lambda tup: tup[1])[-3:])
`df_features`: Most significant projections of PCA component 78:
[('DSEligible', -0.39888862828646721), ('VIN_numReturned1', -0.16018987870664869), ('Returned_asm', -0.15564636661469736)]
...
[('VIN_numTransactions', 0.094004552775801192), ('VIN_fracReturnedasm1DivTransactions', 0.39932190389544714), ('VIN_fracDSEligible1DivTransactions', 0.76826145356509656)]
In [19]:
print(textwrap.dedent("""\
    Tune feature space by optimizing the model score
    with cross validation. Model scores are R^2,
    the coefficient of determination."""))
time_start = time.perf_counter()

print("Progress:", end=' ')
size_data = len(df_features3)
size_sub = 1000
frac_test = 0.2
replace = False
n_scores = 10
estimator = sk_ens.ExtraTreesRegressor(n_estimators=10, n_jobs=-1)
nftrs_scores = list()
idxs = itertools.chain(range(0, 20), range(20, 50, 2), range(50, len(important_features), 5))
for idx in idxs:
    n_ftrs = idx+1
    ftrs = important_features[:n_ftrs]
    scores = list()
    for _ in range(0, n_scores):
        idxs_sub = np.random.choice(a=size_data, size=size_sub, replace=replace)
        (ftrs_train, ftrs_test,
         trg_train, trg_test) = sk_cv.train_test_split(
            df_features3[ftrs].values[idxs_sub], ds_target.values[idxs_sub],
            test_size=frac_test)
        estimator.fit(X=ftrs_train, y=trg_train)
        scores.append(estimator.score(X=ftrs_test, y=trg_test))
    nftrs_scores.append([n_ftrs, scores])
    if idx % 10 == 0:
        print("{frac:.0%}".format(frac=(idx+1)/len(important_features)), end=' ')
print('\n')

nftrs_pctls = np.asarray(
    [np.append(tup[0], np.percentile(tup[1], q=[5,50,95]))
     for tup in nftrs_scores])
plt.plot(
    nftrs_pctls[:, 0], nftrs_pctls[:, 2],
    marker='.', color=sns.color_palette()[0],
    label='50th pctl score')
plt.fill_between(
    nftrs_pctls[:, 0],
    y1=nftrs_pctls[:, 1],
    y2=nftrs_pctls[:, 3],
    alpha=0.5, color=sns.color_palette()[0],
    label='5-95th pctls of scores')
plt.title("Model score vs number of features")
plt.xlabel("Number of features")
plt.ylabel("Model score")
plt.legend(loc='upper left')
plt.show()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
Tune feature space by optimizing the model score
with cross validation. Model scores are R^2,
the coefficient of determination.
Progress: 2% 19% 36% 53% 69% 86% 

Time elapsed (sec) = 95.5
In [20]:
print("""`important_features2`, `df_features4`:
`df_features4` is a view into (not a copy) of `df_features3` with only
`important_features2`. Feature importance is the normalized reduction
in the loss score. A feature is selected as 'important' from the
model score vs features plot.
""")
time_start = time.perf_counter()

# Keep top 10 features from score vs features plot.
important_features2 = important_features[:10]

df_features4 = df_features3[important_features2]
print("`important_features2` =")
print(important_features2)
print()

print("""Cluster map of important feature correlations with heirarchical relationships.
The deeper of the dendrogram node, the higher (anti)correlated the features are.
The Spearman rank correlation accommodates non-linear features.
The pair plot is a scatter matrix plot of columns vs each other.
""")
path_plot_dir = os.path.join(path_data_dir, 'plot_model')

# Notes:
# * `size_sub` for computing correlations should be <= 1e3 else runtime is long.
# * Use replace=False to show most data variety.
# * For pairplot, only plot the target variable with the top 5 important
#     features for legibility.
# * For clustermap, `nlabels` shows every `nlabels`th label, so 20 labels total.
size_sub = min(int(1e3), len(df_features4.index))
idxs_sub = np.random.choice(a=df_features4.index, size=size_sub, replace=False)
df_plot_sub = df_features4.loc[idxs_sub].copy()
df_plot_sub[target] = ds_target.loc[idxs_sub].copy()
df_plot_sub['buyer_retrate_gt01'] = df_features3.loc[idxs_sub, buyer_retrate] > buyer_retrate_max

print(("Clustermap of target, '{target}', top 10 important features, buyer_retrate_gt01:").format(
        target=target))
sns.clustermap(df_plot_sub[[target]+important_features2[:10]+['buyer_retrate_gt01']].corr(method='spearman'))
plt.savefig(
    os.path.join(path_plot_dir, 'model_clustermap.png'),
    bbox_inches='tight', dpi=300)
plt.show()

print(("Pairplot of target, '{target}', top 5 important features, buyer_retrate_gt01:").format(
        target=target))
df_pairplot = df_plot_sub[[target]+important_features2[:5]+['buyer_retrate_gt01']]
print(df_pairplot.columns)
ds_columns = pd.Series(df_pairplot.columns, name='column')
ds_columns.to_csv(
    os.path.join(path_plot_dir, 'model_pairplot_index_column_map.csv'),
    header=True, index_label='index')
df_pairplot.columns = ds_columns.index
df_pairplot.loc[:, target] = df_pairplot[np.where(ds_columns.values == target)[0][0]]
df_pairplot.loc[:, 'buyer_retrate_gt01'] = df_pairplot[np.where(ds_columns.values == 'buyer_retrate_gt01')[0][0]]
df_pairplot.drop([np.where(ds_columns.values == target)[0][0]], axis=1, inplace=True)
df_pairplot.drop([np.where(ds_columns.values == 'buyer_retrate_gt01')[0][0]], axis=1, inplace=True)
sns.pairplot(
    df_pairplot,
    hue='buyer_retrate_gt01', diag_kind='hist', markers=['.', 'o'],
    palette=[sns.color_palette()[1], sns.color_palette()[2]],
    plot_kws={'alpha':1.0})
plt.savefig(
    os.path.join(path_plot_dir, 'model_pairplot.png'),
    bbox_inches='tight', dpi=300)
plt.show()

print("Summarize top 5 important features:")
print(df_features4[important_features2[:5]].describe(percentiles=percentiles, include='all'))
print()
print("First 5 records for top 5 important features:")
print(df_features4[important_features2[:5]].head())
print()
print("""Describe top 5 important features. Format:
Feature: importance score.
Histogram of feature values.""")
cols_scores = df_importances[important_features2[:5]].mean().items()
for (col, score) in cols_scores:
    # Describe feature variables.
    print(
        ("{col}:\n" +
         "    importance: {score:.3f}").format(col=col, score=score))
    # Plot histogram of feature variables.
    tfmask_gt01 = df_features3[buyer_retrate] > buyer_retrate_max
    sns.distplot(
        df_features4.loc[np.logical_not(tfmask_gt01), col], hist=True, kde=False, norm_hist=False,
        label='buyer_retrate_lt01', color=sns.color_palette()[1])
    sns.distplot(
        df_features4.loc[tfmask_gt01, col], hist=True, kde=False, norm_hist=False,
        label='buyer_retrate_gt01', color=sns.color_palette()[2])
    plt.title('Feature value histogram')
    plt.xlabel("Feature value, '{ftr}'".format(ftr=col))
    plt.ylabel('Number of feature values')
    plt.legend(loc='upper left')
    plt.show()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
`important_features2`, `df_features4`:
`df_features4` is a view into (not a copy) of `df_features3` with only
`important_features2`. Feature importance is the normalized reduction
in the loss score. A feature is selected as 'important' from the
model score vs features plot.

`important_features2` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_78', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_72', 'pca_comp_76', 'pca_comp_75', 'pca_comp_73']

Cluster map of important feature correlations with heirarchical relationships.
The deeper of the dendrogram node, the higher (anti)correlated the features are.
The Spearman rank correlation accommodates non-linear features.
The pair plot is a scatter matrix plot of columns vs each other.

Clustermap of target, 'Returned', top 10 important features, buyer_retrate_gt01:
Pairplot of target, 'Returned', top 5 important features, buyer_retrate_gt01:
Index(['Returned', 'VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm',
       'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull',
       'pca_comp_78', 'buyer_retrate_gt01'],
      dtype='object')
/opt/conda/lib/python3.5/site-packages/pandas/core/indexing.py:288: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/opt/conda/lib/python3.5/site-packages/pandas/core/indexing.py:465: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:55: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:56: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1   Returned_asm  \
count                          278337.000000  278337.000000   
mean                                0.072437       0.070095   
std                                 0.254448       0.255307   
min                                 0.000000       0.000000   
5%                                  0.000000       0.000000   
15.9%                               0.000000       0.000000   
50%                                 0.000000       0.000000   
84.1%                               0.000000       0.000000   
95%                                 1.000000       1.000000   
max                                 1.000000       1.000000   

       VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  \
count           278337.000000                        278337.000000   
mean                 0.079350                             0.020815   
std                  0.277162                             0.141990   
min                  0.000000                             0.000000   
5%                   0.000000                             0.000000   
15.9%                0.000000                             0.000000   
50%                  0.000000                             0.000000   
84.1%                0.000000                             0.000000   
95%                  1.000000                             0.000000   
max                  4.000000                             1.000000   

        pca_comp_78  
count  2.783370e+05  
mean   2.431927e-17  
std    8.707895e-03  
min   -2.556930e-01  
5%    -3.498293e-03  
15.9% -5.717638e-04  
50%    3.778332e-05  
84.1%  5.294729e-04  
95%    1.209417e-03  
max    2.741183e-01  

First 5 records for top 5 important features:
   VIN_fracReturnedNotNullDivDSEligible1  Returned_asm  \
0                                    0.0             0   
1                                    0.0             0   
2                                    0.0             0   
3                                    0.0             0   
4                                    0.0             0   

   VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  pca_comp_78  
0                       0                                  0.0     0.000569  
1                       0                                  0.0     0.000502  
2                       0                                  0.0     0.001204  
3                       0                                  0.0     0.000335  
4                       0                                  0.0     0.001024  

Describe top 5 important features. Format:
Feature: importance score.
Histogram of feature values.
VIN_fracReturnedNotNullDivDSEligible1:
    importance: 0.758
Returned_asm:
    importance: 0.109
VIN_numReturnedNotNull:
    importance: 0.075
VIN_fracReturned1DivReturnedNotNull:
    importance: 0.016
pca_comp_78:
    importance: 0.009
Time elapsed (sec) = 22.7
In [21]:
print("""Tune model hyperparameters by optimizing the model score
with cross validation. Model scores are R^2,
the coefficient of determination.
""")
time_start = time.perf_counter()

print("Progress:", end=' ')
size_data = len(df_features4)
size_sub = min(len(df_features4), int(2e3))
frac_test = 0.2
replace = False
nest_list = [10, 30, 100, 300]
n_scores = 10
nest_scores = list()
for (inum, n_est) in enumerate(nest_list):
    estimator = sk_ens.ExtraTreesRegressor(n_estimators=n_est, n_jobs=-1)
    scores = list()
    for _ in range(0, n_scores):
        idxs_sub = np.random.choice(a=size_data, size=size_sub, replace=replace)
        (ftrs_train, ftrs_test,
         trg_train, trg_test) = sk_cv.train_test_split(
            df_features4.values[idxs_sub], ds_target.values[idxs_sub],
            test_size=frac_test)
        estimator.fit(X=ftrs_train, y=trg_train)
        scores.append(estimator.score(
                X=ftrs_test, y=trg_test))
    nest_scores.append([n_est, scores])
    print("{frac:.0%}".format(frac=(inum+1)/len(nest_list)), end=' ')
print('\n')

nest_pctls = np.asarray(
    [np.append(tup[0], np.percentile(tup[1], q=[5,50,95]))
     for tup in nest_scores])
plt.plot(
    nest_pctls[:, 0], nest_pctls[:, 2],
    marker='.', color=sns.color_palette()[0],
    label='50th pctl score')
plt.fill_between(
    nest_pctls[:, 0],
    y1=nest_pctls[:, 1],
    y2=nest_pctls[:, 3],
    alpha=0.5, color=sns.color_palette()[0],
    label='5-95th pctls of scores')
plt.title("Model score vs number of estimators")
plt.xlabel("Number of estimators")
plt.ylabel("Model score")
plt.legend(loc='lower left')
plt.show()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
Tune model hyperparameters by optimizing the model score
with cross validation. Model scores are R^2,
the coefficient of determination.

Progress: 25% 50% 75% 100% 

Time elapsed (sec) = 11.4
In [22]:
print("""Test significance of predictions by shuffling the target values.
Model scores are r^2, the coefficient of determination.
""")
n_estimators = 50 # from tuning curve
time_start = time.perf_counter()

# Calculate significance of score.
estimator = sk_ens.ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=-1)
demo.utils.utils.calc_score_pvalue(
    estimator=estimator, df_features=df_features4, ds_target=ds_target,
    n_iter=20, size_sub=None, frac_test=0.2,
    replace=False, show_progress=True, show_plot=True)
print()

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
Test significance of predictions by shuffling the target values.
Model scores are r^2, the coefficient of determination.

Progress: 5% 25% 45% 65% 85% 

/opt/conda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Average model score with shuffling: -0.116
Average model score without shuffling: 0.991
Null hypothesis: There is no distinction in the differences
between the mean model scores whether or not the target
values have been shuffled.
Outcome: Assuming the null hypothesis, the probability of
obtaining a difference between the mean model scores at least
as great as 1.11 is 0.0%.

Time elapsed (sec) = 11.8
In [23]:
print("""Predict target values with cross-validation,
plot actual vs predicted and score.
""")
n_estimators = 50 # from tuning curve
time_start = time.perf_counter()

print("Progress:", end=' ')
n_folds = 5
estimator = sk_ens.ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=-1)
kfolds = sk_cv.KFold(n=len(df_features4), n_folds=n_folds, shuffle=True)
ds_predicted = pd.Series(index=ds_target.index, name=target+'_pred')
idxs_pred = set()
for (inum, (idxs_train, idxs_test)) in enumerate(kfolds):
    if not idxs_pred.isdisjoint(idxs_test):
        raise AssertionError(
            ("Program error. Each record must be predicted only once.\n" +
             "Required: idxs_pred.isdisjoint(idxs_test)"))
    idxs_pred.update(idxs_test)
    ftrs_train = df_features4.values[idxs_train]
    ftrs_test  = df_features4.values[idxs_test]
    trg_train  = ds_target.values[idxs_train]
    trg_test   = ds_target.values[idxs_test]
    estimator.fit(X=ftrs_train, y=trg_train)
    ds_predicted.iloc[idxs_test] = estimator.predict(X=ftrs_test)
    print("{frac:.0%}".format(frac=(inum+1)/n_folds), end=' ')
print('\n')

score = sk_met.r2_score(
    y_true=ds_target, y_pred=ds_predicted)
print("Model score = {score:.3f}".format(score=score))
path_plot_dir = os.path.join(path_data_dir, 'plot_model')
demo.utils.utils.plot_actual_vs_predicted(
    y_true=ds_target.values, y_pred=ds_predicted.values,
    loglog=False, xylims=(-1.1, 1.1),
    path=os.path.join(path_plot_dir, 'model_actual_vs_predicted.jpg'))

print("""`features.pkl`, `estimator.pkl`: Save features and estimator.
""")
path_ftr = os.path.join(path_data, 'features.pkl')
with open(path_ftr, mode='wb') as fobj:
    pickle.dump(obj=df_features4.columns, file=fobj)
path_est = os.path.join(path_data, 'estimator.pkl')
with open(path_est, mode='wb') as fobj:
    pickle.dump(obj=estimator, file=fobj)

time_stop = time.perf_counter()
print("Time elapsed (sec) = {diff:.1f}".format(diff=time_stop-time_start))
Predict target values with cross-validation,
plot actual vs predicted and score.

Progress: 20% 40% 60% 80% 100% 

Model score = 0.995
`features.pkl`, `estimator.pkl`: Save features and estimator.

Time elapsed (sec) = 7.4
In [60]:
demo.app_predict.predict.create_pipeline_model(df=df, path_data_dir=path_data_dir, show_plots=False)
################################################################################
Features:
['Arbitrated', 'Autocheck_score', 'BuyerID_fracDSEligible1DivTransactions', 'BuyerID_fracReturned1DivReturnedNotNull', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracReturnedasm1DivTransactions', 'BuyerID_numDSEligible1', 'BuyerID_numReturned1', 'BuyerID_numReturnedNotNull', 'BuyerID_numReturnedasm1', 'BuyerID_numTransactions', 'CarMake_fracDSEligible1DivTransactions', 'CarMake_fracReturned1DivReturnedNotNull', 'CarMake_fracReturnedNotNullDivDSEligible1', 'CarMake_fracReturnedasm1DivTransactions', 'CarMake_numDSEligible1', 'CarMake_numReturned1', 'CarMake_numReturnedNotNull', 'CarMake_numReturnedasm1', 'CarMake_numTransactions', 'CarYear', 'ConditionReport', 'DSEligible', 'InLane', 'JDPowersCat_COMPACTCAR', 'JDPowersCat_EXCLUDED', 'JDPowersCat_FULLSIZECAR', 'JDPowersCat_LUXURYCAR', 'JDPowersCat_MIDSIZECAR', 'JDPowersCat_PICKUP', 'JDPowersCat_SPORTSCAR', 'JDPowersCat_SUV', 'JDPowersCat_UNKNOWN', 'JDPowersCat_VAN', 'JDPowersCat_fracDSEligible1DivTransactions', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'JDPowersCat_fracReturnedasm1DivTransactions', 'JDPowersCat_numDSEligible1', 'JDPowersCat_numReturned1', 'JDPowersCat_numReturnedNotNull', 'JDPowersCat_numReturnedasm1', 'JDPowersCat_numTransactions', 'LIGHTG', 'LIGHTR', 'LIGHTY', 'LIGHT_N0G1Y2R3', 'MMR', 'Mileage', 'OVE', 'PSI', 'PSIEligible', 'Returned_asm', 'SaleDate_day', 'SaleDate_decyear', 'SaleDate_dow', 'SaleDate_doy', 'SalePrice', 'Salvage', 'SellerID_fracDSEligible1DivTransactions', 'SellerID_fracReturned1DivReturnedNotNull', 'SellerID_fracReturnedNotNullDivDSEligible1', 'SellerID_fracReturnedasm1DivTransactions', 'SellerID_numDSEligible1', 'SellerID_numReturned1', 'SellerID_numReturnedNotNull', 'SellerID_numReturnedasm1', 'SellerID_numTransactions', 'SellingLocation_fracDSEligible1DivTransactions', 'SellingLocation_fracReturned1DivReturnedNotNull', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_lat', 'SellingLocation_lon', 'SellingLocation_numDSEligible1', 'SellingLocation_numReturned1', 'SellingLocation_numReturnedNotNull', 'SellingLocation_numReturnedasm1', 'SellingLocation_numTransactions', 'Simulcast', 'VIN_fracDSEligible1DivTransactions', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'VIN_fracReturnedasm1DivTransactions', 'VIN_numDSEligible1', 'VIN_numReturned1', 'VIN_numReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_numTransactions']

################################################################################
`Container`: Create an empty container class and
dynamically allocate attributes to hold variables for specific steps
of the pipeline. 
`step.s0.[df,ds]_[features,target]`: Save initial state of features, target.

################################################################################
`transformer_scaler`, `transformer_pca`: Scale data
then make groups of similar records with k-means clustering,
both with and without PCA. Use the silhouette score to determine
the number of clusters.

`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
Plot scores for scaled features:
Progress: 11% 22% 33% 44% 56% 67% 78% 89% 100% 

Plot scores for scaled PCA features:
Progress: 11% 22% 33% 44% 56% 67% 78% 89% 100% 

Time elapsed (sec) = 51.5

################################################################################
`transformer_kmeans`, `transformer_kmeans_pca`:
Fit k-means to the data with/without PCA and
compare the centroids for the clusters.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
Time elapsed (sec) = 1.5

################################################################################
`df_features2`: Combine `df_features` with
cluster labels, cluster distances, PCA components, PCA cluster labels,
and PCA cluster distances into `df_features`.
Time elapsed (sec) = 1.5

################################################################################
`df_importances` , `important_features`, `df_features3`:
`df_features3` is a view into (not a copy) of `df_features2` with only
`important_features`. Feature importance is the normalized reduction
in the loss score. A feature is selected as 'important' if its average
importance is greater than the average importance of the random feature.
Progress: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 

`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_78', 'VIN_fracReturnedasm1DivTransactions', 'DSEligible', 'pca_comp_75', 'pca_comp_73', 'pca_comp_76', 'pca_comp_72', 'VIN_numReturned1', 'VIN_fracDSEligible1DivTransactions', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedasm1', 'pca_comp_68', 'VIN_numDSEligible1', 'pca_comp_03', 'pca_comp_49', 'pca_comp_57', 'pca_comp_02', 'LIGHTY', 'pca_comp_01', 'Arbitrated', 'pca_cluster_0_dist', 'BuyerID_numReturnedNotNull', 'CarMake_fracReturned1DivReturnedNotNull', 'SalePrice', 'Mileage', 'pca_comp_36', 'pca_comp_50', 'pca_comp_33', 'pca_comp_38', 'pca_comp_06', 'MMR', 'pca_comp_40', 'pca_comp_10', 'VIN_numTransactions', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_12', 'pca_comp_17', 'JDPowersCat_numDSEligible1', 'cluster_0_dist', 'pca_cluster', 'pca_comp_77', 'pca_comp_31', 'BuyerID_fracDSEligible1DivTransactions', 'SellerID_fracReturnedasm1DivTransactions', 'pca_comp_65', 'pca_comp_04', 'pca_comp_32', 'pca_comp_11', 'pca_comp_44', 'CarYear', 'SellingLocation_lat', 'pca_comp_62', 'pca_comp_18', 'JDPowersCat_numReturnedNotNull', 'pca_comp_48', 'BuyerID_numDSEligible1', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_30', 'pca_comp_63', 'pca_comp_00', 'pca_comp_47', 'SellingLocation_lon', 'pca_comp_64', 'pca_cluster_1_dist', 'cluster', 'pca_comp_51', 'BuyerID_numReturnedasm1', 'pca_comp_54', 'pca_comp_53', 'pca_comp_67', 'pca_comp_69', 'Autocheck_score', 'pca_comp_19', 'pca_comp_66', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'pca_comp_22', 'pca_comp_70', 'pca_comp_28', 'pca_comp_56']

Time elapsed (sec) = 11.9
`df_features`: Most significant projections of PCA component 78:
[('DSEligible', -0.39888862828646721), ('VIN_numReturned1', -0.16018987870664869), ('Returned_asm', -0.15564636661469736)]
...
[('VIN_numTransactions', 0.094004552775801192), ('VIN_fracReturnedasm1DivTransactions', 0.39932190389544714), ('VIN_fracDSEligible1DivTransactions', 0.76826145356509656)]

################################################################################
Tune feature space by optimizing the model score
with cross validation. Model scores are R^2,
the coefficient of determination.
Progress: 1% 13% 37% 49% 61% 73% 86% 98% 

Time elapsed (sec) = 61.3

################################################################################
`important_features2`, `df_features4`:
    `df_features4` is a view into (not a copy) of `df_features3` with only
    `important_features2`. Feature importance is the normalized reduction
    in the loss score. A feature is selected as 'important' from the
    model score vs features plot.
    
`important_features2` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_78', 'VIN_fracReturnedasm1DivTransactions', 'DSEligible', 'pca_comp_75', 'pca_comp_73', 'pca_comp_76']

Cluster map of important feature correlations with heirarchical relationships.
    The deeper of the dendrogram node, the higher (anti)correlated the features are.
    The Spearman rank correlation accommodates non-linear features.
    The pair plot is a scatter matrix plot of columns vs each other.
    
Clustermap of target, 'Returned', top 10 important features, buyer_retrate_gt01:
Pairplot of target, 'Returned', top 5 important features, buyer_retrate_gt01:
Index(['Returned', 'VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm',
       'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull',
       'pca_comp_78', 'buyer_retrate_gt01'],
      dtype='object')
/opt/conda/lib/python3.5/site-packages/pandas/core/indexing.py:288: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/opt/conda/lib/python3.5/site-packages/pandas/core/indexing.py:465: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
/opt/demo/demo/app_predict/predict.py:1836: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_pairplot.drop([np.where(ds_columns.values == target)[0][0]], axis=1, inplace=True)
/opt/demo/demo/app_predict/predict.py:1837: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_pairplot.drop([np.where(ds_columns.values == 'buyer_retrate_gt01')[0][0]], axis=1, inplace=True)
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1   Returned_asm  \
count                          278337.000000  278337.000000   
mean                                0.072437       0.070095   
std                                 0.254448       0.255307   
min                                 0.000000       0.000000   
25%                                 0.000000       0.000000   
50%                                 0.000000       0.000000   
75%                                 0.000000       0.000000   
max                                 1.000000       1.000000   

       VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  \
count           278337.000000                        278337.000000   
mean                 0.079350                             0.020815   
std                  0.277162                             0.141990   
min                  0.000000                             0.000000   
25%                  0.000000                             0.000000   
50%                  0.000000                             0.000000   
75%                  0.000000                             0.000000   
max                  4.000000                             1.000000   

        pca_comp_78  
count  2.783370e+05  
mean   2.431927e-17  
std    8.707895e-03  
min   -2.556930e-01  
25%   -2.682069e-04  
50%    3.778332e-05  
75%    3.392039e-04  
max    2.741183e-01  

First 5 records for top 5 important features:
   VIN_fracReturnedNotNullDivDSEligible1  Returned_asm  \
0                                    0.0             0   
1                                    0.0             0   
2                                    0.0             0   
3                                    0.0             0   
4                                    0.0             0   

   VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  pca_comp_78  
0                       0                                  0.0     0.000569  
1                       0                                  0.0     0.000502  
2                       0                                  0.0     0.001204  
3                       0                                  0.0     0.000335  
4                       0                                  0.0     0.001024  

Describe top 5 important features. Format:
    Feature: importance score.
    Histogram of feature values.
VIN_fracReturnedNotNullDivDSEligible1:
    importance: 0.794
Returned_asm:
    importance: 0.106
VIN_numReturnedNotNull:
    importance: 0.041
VIN_fracReturned1DivReturnedNotNull:
    importance: 0.018
pca_comp_78:
    importance: 0.007
Time elapsed (sec) = 22.7

################################################################################
Tune model hyperparameters by optimizing the model score
    with cross validation. Model scores are R^2,
    the coefficient of determination.
    
Progress: 25% 50% 75% 100% 

Time elapsed (sec) = 13.0

################################################################################
Test significance of predictions by shuffling the target values.
    Model scores are r^2, the coefficient of determination.
    
Progress: 5% 25% 45% 65% 85% 

Average model score with shuffling: -0.147
Average model score without shuffling: 0.993
/opt/conda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Null hypothesis: There is no distinction in the differences
between the mean model scores whether or not the target
values have been shuffled.
Outcome: Assuming the null hypothesis, the probability of
obtaining a difference between the mean model scores at least
as great as 1.14 is 0.0%.

Time elapsed (sec) = 11.3

################################################################################
Predict target values with cross-validation,
    plot actual vs predicted and score.
    
Progress: 20% 40% 60% 80% 100% 

Model score = 0.995
`features.pkl`, `estimator.pkl`: Save features and estimator.
    
Time elapsed (sec) = 5.9

In [61]:
gc.collect()
Out[61]:
19487
In [143]:
demo.utils.utils.plot_actual_vs_predicted(
    y_true=ds_target.values, y_pred=ds_predicted.values,
    loglog=False, xylims=(-1.1, 1.1),
    path=os.path.join(path_plot_dir, 'model_actual_vs_predicted.jpg'))
In [150]:
nftrs_pctls = np.asarray(
    [np.append(tup[0], np.percentile(tup[1], q=[5,50,95]))
     for tup in nftrs_scores])
plt.plot(
    nftrs_pctls[:, 0], nftrs_pctls[:, 2],
    marker='.', color=sns.color_palette()[0],
    label='50th pctl score')
plt.fill_between(
    nftrs_pctls[:, 0],
    y1=nftrs_pctls[:, 1],
    y2=nftrs_pctls[:, 3],
    alpha=0.5, color=sns.color_palette()[0],
    label='5-95th pctls of scores')
plt.title("Model score vs number of features")
plt.xlabel("Number of features")
plt.ylabel("Model score")
plt.legend(loc='upper left')
plt.savefig(os.path.join(path_plot_dir, 'model_tune_nfeatures.png'), dpi=300)
plt.show()
In [152]:
path_plot_dir = os.path.join(path_data_dir, 'plot_model')
df_importances = demo.utils.utils.calc_feature_importances(
    estimator=estimator, df_features=df_features2, ds_target=ds_target,
    replace=False, show_progress=True, show_plot=True, path=os.path.join(path_plot_dir, 'model_feature_importances.png'))
Progress: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 

In [34]:
# To make an example random forest tree
path_plot_dir = os.path.join(path_data_dir, 'plot_model')
path_dot = os.path.join(path_plot_dir, 'model_decision_tree.dot')
print("Graphviz dot and SVG files =\n{path}\n{path}.svg".format(path=path_dot))
sk_tr.export_graphviz(
    decision_tree=estimator.estimators_[0], out_file=path_dot,
    feature_names=df_features4.columns)
cmd = ['dot', '-Tsvg', path_dot, '-O']
subprocess.run(args=cmd, check=True)
cmd = ['dot', '-Tjpng', path_dot, '-O']
subprocess.run(args=cmd, check=True)
display(SVG(filename=path_dot+'.svg'))
Graphviz dot and SVG files =
/opt/demo/demo/app_predict/data/plot_model/model_decision_tree.dot
/opt/demo/demo/app_predict/data/plot_model/model_decision_tree.dot.svg
Tree 0 VIN_fracReturnedNotNullDivDSEligible1 <= 0.7995 mse = 0.1085 samples = 222670 value = -0.9142 1 VIN_fracReturnedNotNullDivDSEligible1 <= 0.3563 mse = 0.0071 samples = 207619 value = -0.9953 0->1 True 512 Returned_asm <= 0.5898 mse = 0.1662 samples = 15051 value = 0.2042 0->512 False 2 pca_comp_78 <= -0.1371 mse = 0.0003 samples = 205601 value = -0.9998 1->2 45 pca_comp_78 <= -0.1131 mse = 0.4847 samples = 2018 value = -0.5342 1->45 3 mse = 0.0 samples = 13 value = 1.0 2->3 4 VIN_fracReturnedNotNullDivDSEligible1 <= 0.236 mse = 0.0001 samples = 205588 value = -0.9999 2->4 5 mse = 0.0 samples = 205403 value = -1.0 4->5 6 pca_comp_72 <= -0.2109 mse = 0.0653 samples = 185 value = -0.9297 4->6 7 pca_comp_72 <= -0.2399 mse = 0.2109 samples = 43 value = -0.6977 6->7 44 mse = 0.0 samples = 142 value = -1.0 6->44 8 mse = 0.0 samples = 2 value = 0.0 7->8 9 pca_comp_72 <= -0.228 mse = 0.1963 samples = 41 value = -0.7317 7->9 10 pca_comp_72 <= -0.2384 mse = 0.2417 samples = 22 value = -0.5909 9->10 31 VIN_fracReturnedNotNullDivDSEligible1 <= 0.3051 mse = 0.0942 samples = 19 value = -0.8947 9->31 11 mse = 0.0 samples = 2 value = 0.0 10->11 12 pca_comp_78 <= 0.048 mse = 0.2275 samples = 20 value = -0.65 10->12 13 pca_comp_78 <= 0.0475 mse = 0.1302 samples = 13 value = -0.8462 12->13 24 pca_comp_73 <= -0.0492 mse = 0.2041 samples = 7 value = -0.2857 12->24 14 mse = 0.0 samples = 5 value = -1.0 13->14 15 pca_comp_72 <= -0.2313 mse = 0.1875 samples = 8 value = -0.75 13->15 16 pca_comp_75 <= 0.1361 mse = 0.25 samples = 2 value = -0.5 15->16 19 pca_comp_72 <= -0.2304 mse = 0.1389 samples = 6 value = -0.8333 15->19 17 mse = 0.0 samples = 1 value = 0.0 16->17 18 mse = 0.0 samples = 1 value = -1.0 16->18 20 pca_comp_75 <= 0.1371 mse = 0.2222 samples = 3 value = -0.6667 19->20 23 mse = 0.0 samples = 3 value = -1.0 19->23 21 mse = 0.0 samples = 2 value = -1.0 20->21 22 mse = 0.0 samples = 1 value = 0.0 20->22 25 pca_comp_76 <= 0.1411 mse = 0.1389 samples = 6 value = -0.1667 24->25 30 mse = 0.0 samples = 1 value = -1.0 24->30 26 pca_comp_76 <= 0.1402 mse = 0.25 samples = 2 value = -0.5 25->26 29 mse = 0.0 samples = 4 value = 0.0 25->29 27 mse = 0.0 samples = 1 value = -1.0 26->27 28 mse = 0.0 samples = 1 value = 0.0 26->28 32 pca_comp_76 <= 0.1448 mse = 0.1875 samples = 4 value = -0.75 31->32 35 pca_comp_76 <= 0.1421 mse = 0.0622 samples = 15 value = -0.9333 31->35 33 mse = 0.0 samples = 3 value = -1.0 32->33 34 mse = 0.0 samples = 1 value = 0.0 32->34 36 pca_comp_75 <= 0.1366 mse = 0.1224 samples = 7 value = -0.8571 35->36 43 mse = 0.0 samples = 8 value = -1.0 35->43 37 mse = 0.0 samples = 2 value = -1.0 36->37 38 pca_comp_75 <= 0.1373 mse = 0.16 samples = 5 value = -0.8 36->38 39 mse = 0.0 samples = 2 value = -1.0 38->39 40 pca_comp_75 <= 0.1391 mse = 0.2222 samples = 3 value = -0.6667 38->40 41 mse = 0.0 samples = 2 value = -1.0 40->41 42 mse = 0.0 samples = 1 value = 0.0 40->42 46 mse = 0.0 samples = 234 value = 1.0 45->46 47 VIN_fracReturned1DivReturnedNotNull <= 0.5001 mse = 0.1991 samples = 1784 value = -0.7354 45->47 48 pca_comp_73 <= -0.499 mse = 0.2356 samples = 712 value = -0.3399 47->48 509 VIN_numReturnedNotNull <= 2.4689 mse = 0.0019 samples = 1072 value = -0.9981 47->509 49 mse = 0.0 samples = 4 value = 1.0 48->49 50 VIN_fracReturned1DivReturnedNotNull <= 0.0614 mse = 0.2267 samples = 708 value = -0.3475 48->50 51 pca_comp_76 <= 0.2518 mse = 0.2242 samples = 675 value = -0.3393 50->51 470 pca_comp_73 <= -0.1966 mse = 0.2498 samples = 33 value = -0.5152 50->470 52 pca_comp_75 <= -0.4215 mse = 0.2253 samples = 668 value = -0.3428 51->52 469 mse = 0.0 samples = 7 value = 0.0 51->469 53 mse = 0.0 samples = 1 value = -1.0 52->53 54 pca_comp_78 <= 0.0351 mse = 0.225 samples = 667 value = -0.3418 52->54 55 pca_comp_72 <= -0.1887 mse = 0.2482 samples = 378 value = -0.4577 54->55 320 pca_comp_78 <= 0.038 mse = 0.1541 samples = 289 value = -0.1903 54->320 56 pca_comp_72 <= -0.1895 mse = 0.2082 samples = 220 value = -0.2955 55->56 195 pca_comp_75 <= 0.0983 mse = 0.2163 samples = 158 value = -0.6835 55->195 57 pca_comp_73 <= -0.0698 mse = 0.1869 samples = 193 value = -0.2487 56->57 174 pca_comp_72 <= -0.1888 mse = 0.2332 samples = 27 value = -0.6296 56->174 58 mse = 0.0 samples = 9 value = 0.0 57->58 59 pca_comp_73 <= -0.0518 mse = 0.1928 samples = 184 value = -0.2609 57->59 60 pca_comp_76 <= 0.1144 mse = 0.2065 samples = 151 value = -0.2914 59->60 167 pca_comp_75 <= 0.0975 mse = 0.1065 samples = 33 value = -0.1212 59->167 61 mse = 0.0 samples = 8 value = 0.0 60->61 62 pca_comp_73 <= -0.0547 mse = 0.213 samples = 143 value = -0.3077 60->62 63 pca_comp_72 <= -0.2 mse = 0.2259 samples = 87 value = -0.3448 62->63 124 pca_comp_76 <= 0.1192 mse = 0.1875 samples = 56 value = -0.25 62->124 64 mse = 0.0 samples = 10 value = 0.0 63->64 65 pca_comp_72 <= -0.1915 mse = 0.2378 samples = 77 value = -0.3896 63->65 66 pca_comp_75 <= 0.0971 mse = 0.1561 samples = 31 value = -0.1935 65->66 83 pca_comp_76 <= 0.1194 mse = 0.2495 samples = 46 value = -0.5217 65->83 67 pca_comp_76 <= 0.116 mse = 0.25 samples = 2 value = -0.5 66->67 70 pca_comp_72 <= -0.1918 mse = 0.1427 samples = 29 value = -0.1724 66->70 68 mse = 0.0 samples = 1 value = 0.0 67->68 69 mse = 0.0 samples = 1 value = -1.0 67->69 71 pca_comp_72 <= -0.1928 mse = 0.1134 samples = 23 value = -0.1304 70->71 80 pca_comp_75 <= 0.0979 mse = 0.2222 samples = 6 value = -0.3333 70->80 72 mse = 0.0 samples = 11 value = 0.0 71->72 73 pca_comp_73 <= -0.0566 mse = 0.1875 samples = 12 value = -0.25 71->73 74 pca_comp_72 <= -0.1919 mse = 0.24 samples = 5 value = -0.6 73->74 79 mse = 0.0 samples = 7 value = 0.0 73->79 75 pca_comp_76 <= 0.1188 mse = 0.1875 samples = 4 value = -0.75 74->75 78 mse = 0.0 samples = 1 value = 0.0 74->78 76 mse = 0.0 samples = 3 value = -1.0 75->76 77 mse = 0.0 samples = 1 value = 0.0 75->77 81 mse = 0.0 samples = 2 value = -1.0 80->81 82 mse = 0.0 samples = 4 value = 0.0 80->82 84 pca_comp_76 <= 0.1183 mse = 0.2494 samples = 40 value = -0.475 83->84 119 pca_comp_76 <= 0.1212 mse = 0.1389 samples = 6 value = -0.8333 83->119 85 pca_comp_78 <= 0.034 mse = 0.1094 samples = 8 value = -0.875 84->85 88 pca_comp_72 <= -0.1913 mse = 0.2344 samples = 32 value = -0.375 84->88 86 mse = 0.0 samples = 1 value = 0.0 85->86 87 mse = 0.0 samples = 7 value = -1.0 85->87 89 mse = 0.0 samples = 2 value = 0.0 88->89 90 pca_comp_75 <= 0.101 mse = 0.24 samples = 30 value = -0.4 88->90 91 pca_comp_72 <= -0.191 mse = 0.2449 samples = 28 value = -0.4286 90->91 118 mse = 0.0 samples = 2 value = 0.0 90->118 92 pca_comp_72 <= -0.1913 mse = 0.24 samples = 5 value = -0.6 91->92 99 pca_comp_73 <= -0.0557 mse = 0.2382 samples = 23 value = -0.3913 91->99 93 mse = 0.0 samples = 1 value = -1.0 92->93 94 pca_comp_78 <= 0.0347 mse = 0.25 samples = 4 value = -0.5 92->94 95 mse = 0.0 samples = 1 value = -1.0 94->95 96 pca_comp_72 <= -0.1912 mse = 0.2222 samples = 3 value = -0.3333 94->96 97 mse = 0.0 samples = 1 value = -1.0 96->97 98 mse = 0.0 samples = 2 value = 0.0 96->98 100 pca_comp_73 <= -0.0571 mse = 0.2148 samples = 16 value = -0.3125 99->100 113 pca_comp_73 <= -0.0553 mse = 0.2449 samples = 7 value = -0.5714 99->113 101 pca_comp_78 <= 0.035 mse = 0.24 samples = 5 value = -0.6 100->101 108 pca_comp_73 <= -0.0565 mse = 0.1488 samples = 11 value = -0.1818 100->108 102 pca_comp_76 <= 0.1186 mse = 0.1875 samples = 4 value = -0.75 101->102 107 mse = 0.0 samples = 1 value = 0.0 101->107 103 pca_comp_78 <= 0.0345 mse = 0.25 samples = 2 value = -0.5 102->103 106 mse = 0.0 samples = 2 value = -1.0 102->106 104 mse = 0.0 samples = 1 value = 0.0 103->104 105 mse = 0.0 samples = 1 value = -1.0 103->105 109 pca_comp_75 <= 0.0987 mse = 0.25 samples = 4 value = -0.5 108->109 112 mse = 0.0 samples = 7 value = 0.0 108->112 110 mse = 0.0 samples = 2 value = -1.0 109->110 111 mse = 0.0 samples = 2 value = 0.0 109->111 114 mse = 0.0 samples = 3 value = -1.0 113->114 115 pca_comp_73 <= -0.055 mse = 0.1875 samples = 4 value = -0.25 113->115 116 mse = 0.0 samples = 3 value = 0.0 115->116 117 mse = 0.0 samples = 1 value = -1.0 115->117 120 pca_comp_73 <= -0.0631 mse = 0.1875 samples = 4 value = -0.75 119->120 123 mse = 0.0 samples = 2 value = -1.0 119->123 121 mse = 0.0 samples = 1 value = 0.0 120->121 122 mse = 0.0 samples = 3 value = -1.0 120->122 125 pca_comp_73 <= -0.0541 mse = 0.2016 samples = 50 value = -0.28 124->125 166 mse = 0.0 samples = 6 value = 0.0 124->166 126 pca_comp_73 <= -0.0543 mse = 0.2296 samples = 14 value = -0.3571 125->126 137 pca_comp_75 <= 0.0993 mse = 0.1875 samples = 36 value = -0.25 125->137 127 pca_comp_72 <= -0.1926 mse = 0.2222 samples = 6 value = -0.6667 126->127 132 pca_comp_72 <= -0.1904 mse = 0.1094 samples = 8 value = -0.125 126->132 128 mse = 0.0 samples = 1 value = 0.0 127->128 129 pca_comp_72 <= -0.1916 mse = 0.16 samples = 5 value = -0.8 127->129 130 mse = 0.0 samples = 1 value = 0.0 129->130 131 mse = 0.0 samples = 4 value = -1.0 129->131 133 mse = 0.0 samples = 6 value = 0.0 132->133 134 pca_comp_78 <= 0.0343 mse = 0.25 samples = 2 value = -0.5 132->134 135 mse = 0.0 samples = 1 value = 0.0 134->135 136 mse = 0.0 samples = 1 value = -1.0 134->136 138 pca_comp_78 <= 0.0351 mse = 0.2021 samples = 32 value = -0.2812 137->138 165 mse = 0.0 samples = 4 value = 0.0 137->165 139 pca_comp_76 <= 0.1171 mse = 0.1956 samples = 30 value = -0.2667 138->139 162 pca_comp_72 <= -0.1929 mse = 0.25 samples = 2 value = -0.5 138->162 140 pca_comp_78 <= 0.0349 mse = 0.1875 samples = 4 value = -0.75 139->140 143 pca_comp_72 <= -0.1934 mse = 0.1553 samples = 26 value = -0.1923 139->143 141 mse = 0.0 samples = 3 value = -1.0 140->141 142 mse = 0.0 samples = 1 value = 0.0 140->142 144 mse = 0.0 samples = 6 value = 0.0 143->144 145 pca_comp_78 <= 0.0349 mse = 0.1875 samples = 20 value = -0.25 143->145 146 pca_comp_75 <= 0.0982 mse = 0.16 samples = 10 value = -0.2 145->146 153 pca_comp_75 <= 0.0986 mse = 0.21 samples = 10 value = -0.3 145->153 147 pca_comp_73 <= -0.0531 mse = 0.2222 samples = 6 value = -0.3333 146->147 152 mse = 0.0 samples = 4 value = 0.0 146->152 148 pca_comp_73 <= -0.0534 mse = 0.25 samples = 4 value = -0.5 147->148 151 mse = 0.0 samples = 2 value = 0.0 147->151 149 mse = 0.0 samples = 2 value = -1.0 148->149 150 mse = 0.0 samples = 2 value = 0.0 148->150 154 pca_comp_75 <= 0.0975 mse = 0.1224 samples = 7 value = -0.1429 153->154 157 pca_comp_72 <= -0.1917 mse = 0.2222 samples = 3 value = -0.6667 153->157 155 mse = 0.0 samples = 1 value = -1.0 154->155 156 mse = 0.0 samples = 6 value = 0.0 154->156 158 pca_comp_78 <= 0.035 mse = 0.25 samples = 2 value = -0.5 157->158 161 mse = 0.0 samples = 1 value = -1.0 157->161 159 mse = 0.0 samples = 1 value = -1.0 158->159 160 mse = 0.0 samples = 1 value = 0.0 158->160 163 mse = 0.0 samples = 1 value = 0.0 162->163 164 mse = 0.0 samples = 1 value = -1.0 162->164 168 pca_comp_75 <= 0.097 mse = 0.1956 samples = 15 value = -0.2667 167->168 173 mse = 0.0 samples = 18 value = 0.0 167->173 169 pca_comp_76 <= 0.1176 mse = 0.2449 samples = 7 value = -0.5714 168->169 172 mse = 0.0 samples = 8 value = 0.0 168->172 170 mse = 0.0 samples = 4 value = -1.0 169->170 171 mse = 0.0 samples = 3 value = 0.0 169->171 175 pca_comp_76 <= 0.1194 mse = 0.2479 samples = 22 value = -0.5455 174->175 194 mse = 0.0 samples = 5 value = -1.0 174->194 176 pca_comp_76 <= 0.1179 mse = 0.2284 samples = 17 value = -0.6471 175->176 191 pca_comp_73 <= -0.0584 mse = 0.16 samples = 5 value = -0.2 175->191 177 pca_comp_78 <= 0.0346 mse = 0.1224 samples = 7 value = -0.8571 176->177 182 pca_comp_76 <= 0.1185 mse = 0.25 samples = 10 value = -0.5 176->182 178 mse = 0.0 samples = 4 value = -1.0 177->178 179 pca_comp_76 <= 0.1177 mse = 0.2222 samples = 3 value = -0.6667 177->179 180 mse = 0.0 samples = 2 value = -1.0 179->180 181 mse = 0.0 samples = 1 value = 0.0 179->181 183 mse = 0.0 samples = 2 value = 0.0 182->183 184 pca_comp_73 <= -0.0573 mse = 0.2344 samples = 8 value = -0.625 182->184 185 mse = 0.0 samples = 2 value = 0.0 184->185 186 pca_comp_76 <= 0.1188 mse = 0.1389 samples = 6 value = -0.8333 184->186 187 pca_comp_76 <= 0.1187 mse = 0.25 samples = 2 value = -0.5 186->187 190 mse = 0.0 samples = 4 value = -1.0 186->190 188 mse = 0.0 samples = 1 value = 0.0 187->188 189 mse = 0.0 samples = 1 value = -1.0 187->189 192 mse = 0.0 samples = 4 value = 0.0 191->192 193 mse = 0.0 samples = 1 value = -1.0 191->193 196 pca_comp_78 <= 0.0343 mse = 0.1178 samples = 22 value = -0.8636 195->196 209 pca_comp_75 <= 0.1 mse = 0.2262 samples = 136 value = -0.6544 195->209 197 mse = 0.0 samples = 5 value = -1.0 196->197 198 pca_comp_72 <= -0.1868 mse = 0.1453 samples = 17 value = -0.8235 196->198 199 pca_comp_72 <= -0.1869 mse = 0.1156 samples = 15 value = -0.8667 198->199 206 pca_comp_73 <= -0.0533 mse = 0.25 samples = 2 value = -0.5 198->206 200 pca_comp_72 <= -0.1874 mse = 0.0663 samples = 14 value = -0.9286 199->200 205 mse = 0.0 samples = 1 value = 0.0 199->205 201 mse = 0.0 samples = 10 value = -1.0 200->201 202 pca_comp_75 <= 0.098 mse = 0.1875 samples = 4 value = -0.75 200->202 203 mse = 0.0 samples = 3 value = -1.0 202->203 204 mse = 0.0 samples = 1 value = 0.0 202->204 207 mse = 0.0 samples = 1 value = 0.0 206->207 208 mse = 0.0 samples = 1 value = -1.0 206->208 210 pca_comp_78 <= 0.034 mse = 0.2478 samples = 53 value = -0.5472 209->210 261 pca_comp_75 <= 0.1095 mse = 0.2003 samples = 83 value = -0.7229 209->261 211 mse = 0.0 samples = 4 value = -1.0 210->211 212 pca_comp_78 <= 0.034 mse = 0.2499 samples = 49 value = -0.5102 210->212 213 mse = 0.0 samples = 1 value = 0.0 212->213 214 pca_comp_76 <= 0.1184 mse = 0.2496 samples = 48 value = -0.5208 212->214 215 pca_comp_78 <= 0.0349 mse = 0.2222 samples = 15 value = -0.6667 214->215 226 pca_comp_75 <= 0.0986 mse = 0.2479 samples = 33 value = -0.4545 214->226 216 pca_comp_72 <= -0.1878 mse = 0.09 samples = 10 value = -0.9 215->216 223 pca_comp_72 <= -0.1872 mse = 0.16 samples = 5 value = -0.2 215->223 217 mse = 0.0 samples = 4 value = -1.0 216->217 218 pca_comp_78 <= 0.0347 mse = 0.1389 samples = 6 value = -0.8333 216->218 219 mse = 0.0 samples = 4 value = -1.0 218->219 220 pca_comp_72 <= -0.187 mse = 0.25 samples = 2 value = -0.5 218->220 221 mse = 0.0 samples = 1 value = -1.0 220->221 222 mse = 0.0 samples = 1 value = 0.0 220->222 224 mse = 0.0 samples = 4 value = 0.0 223->224 225 mse = 0.0 samples = 1 value = -1.0 223->225 227 pca_comp_72 <= -0.1882 mse = 0.1875 samples = 4 value = -0.75 226->227 232 pca_comp_76 <= 0.1192 mse = 0.2426 samples = 29 value = -0.4138 226->232 228 pca_comp_76 <= 0.119 mse = 0.25 samples = 2 value = -0.5 227->228 231 mse = 0.0 samples = 2 value = -1.0 227->231 229 mse = 0.0 samples = 1 value = 0.0 228->229 230 mse = 0.0 samples = 1 value = -1.0 228->230 233 pca_comp_76 <= 0.119 mse = 0.2469 samples = 18 value = -0.5556 232->233 252 pca_comp_72 <= -0.1856 mse = 0.1488 samples = 11 value = -0.1818 232->252 234 pca_comp_78 <= 0.0346 mse = 0.2431 samples = 12 value = -0.4167 233->234 249 pca_comp_72 <= -0.1861 mse = 0.1389 samples = 6 value = -0.8333 233->249 235 pca_comp_72 <= -0.1878 mse = 0.2041 samples = 7 value = -0.2857 234->235 244 pca_comp_76 <= 0.1187 mse = 0.24 samples = 5 value = -0.6 234->244 236 mse = 0.0 samples = 2 value = 0.0 235->236 237 pca_comp_76 <= 0.1189 mse = 0.24 samples = 5 value = -0.4 235->237 238 pca_comp_72 <= -0.1871 mse = 0.25 samples = 4 value = -0.5 237->238 243 mse = 0.0 samples = 1 value = 0.0 237->243 239 pca_comp_76 <= 0.1188 mse = 0.2222 samples = 3 value = -0.3333 238->239 242 mse = 0.0 samples = 1 value = -1.0 238->242 240 mse = 0.0 samples = 2 value = 0.0 239->240 241 mse = 0.0 samples = 1 value = -1.0 239->241 245 mse = 0.0 samples = 2 value = -1.0 244->245 246 pca_comp_72 <= -0.1868 mse = 0.2222 samples = 3 value = -0.3333 244->246 247 mse = 0.0 samples = 2 value = 0.0 246->247 248 mse = 0.0 samples = 1 value = -1.0 246->248 250 mse = 0.0 samples = 5 value = -1.0 249->250 251 mse = 0.0 samples = 1 value = 0.0 249->251 253 pca_comp_78 <= 0.0346 mse = 0.1094 samples = 8 value = -0.125 252->253 258 pca_comp_72 <= -0.1853 mse = 0.2222 samples = 3 value = -0.3333 252->258 254 pca_comp_76 <= 0.1197 mse = 0.2222 samples = 3 value = -0.3333 253->254 257 mse = 0.0 samples = 5 value = 0.0 253->257 255 mse = 0.0 samples = 1 value = -1.0 254->255 256 mse = 0.0 samples = 2 value = 0.0 254->256 259 mse = 0.0 samples = 1 value = -1.0 258->259 260 mse = 0.0 samples = 2 value = 0.0 258->260 262 pca_comp_78 <= 0.0304 mse = 0.1827 samples = 79 value = -0.7595 261->262 319 mse = 0.0 samples = 4 value = 0.0 261->319 263 mse = 0.0 samples = 1 value = 0.0 262->263 264 pca_comp_75 <= 0.1011 mse = 0.1775 samples = 78 value = -0.7692 262->264 265 pca_comp_73 <= -0.0564 mse = 0.1503 samples = 38 value = -0.8158 264->265 294 pca_comp_72 <= -0.1864 mse = 0.1994 samples = 40 value = -0.725 264->294 266 pca_comp_73 <= -0.0656 mse = 0.1389 samples = 36 value = -0.8333 265->266 291 pca_comp_72 <= -0.1866 mse = 0.25 samples = 2 value = -0.5 265->291 267 mse = 0.0 samples = 1 value = 0.0 266->267 268 pca_comp_72 <= -0.1821 mse = 0.1224 samples = 35 value = -0.8571 266->268 269 pca_comp_75 <= 0.1003 mse = 0.1065 samples = 33 value = -0.8788 268->269 288 pca_comp_76 <= 0.1188 mse = 0.25 samples = 2 value = -0.5 268->288 270 mse = 0.0 samples = 7 value = -1.0 269->270 271 pca_comp_73 <= -0.057 mse = 0.1302 samples = 26 value = -0.8462 269->271 272 pca_comp_75 <= 0.1007 mse = 0.1056 samples = 25 value = -0.88 271->272 287 mse = 0.0 samples = 1 value = 0.0 271->287 273 pca_comp_72 <= -0.1858 mse = 0.1453 samples = 17 value = -0.8235 272->273 286 mse = 0.0 samples = 8 value = -1.0 272->286 274 pca_comp_76 <= 0.1193 mse = 0.071 samples = 13 value = -0.9231 273->274 283 pca_comp_76 <= 0.1185 mse = 0.25 samples = 4 value = -0.5 273->283 275 mse = 0.0 samples = 7 value = -1.0 274->275 276 pca_comp_78 <= 0.0346 mse = 0.1389 samples = 6 value = -0.8333 274->276 277 mse = 0.0 samples = 2 value = -1.0 276->277 278 pca_comp_72 <= -0.1866 mse = 0.1875 samples = 4 value = -0.75 276->278 279 pca_comp_72 <= -0.1867 mse = 0.25 samples = 2 value = -0.5 278->279 282 mse = 0.0 samples = 2 value = -1.0 278->282 280 mse = 0.0 samples = 1 value = -1.0 279->280 281 mse = 0.0 samples = 1 value = 0.0 279->281 284 mse = 0.0 samples = 2 value = -1.0 283->284 285 mse = 0.0 samples = 2 value = 0.0 283->285 289 mse = 0.0 samples = 1 value = -1.0 288->289 290 mse = 0.0 samples = 1 value = 0.0 288->290 292 mse = 0.0 samples = 1 value = 0.0 291->292 293 mse = 0.0 samples = 1 value = -1.0 291->293 295 pca_comp_78 <= 0.0333 mse = 0.25 samples = 6 value = -0.5 294->295 300 pca_comp_72 <= -0.1774 mse = 0.1799 samples = 34 value = -0.7647 294->300 296 mse = 0.0 samples = 2 value = 0.0 295->296 297 pca_comp_76 <= 0.1201 mse = 0.1875 samples = 4 value = -0.75 295->297 298 mse = 0.0 samples = 3 value = -1.0 297->298 299 mse = 0.0 samples = 1 value = 0.0 297->299 301 pca_comp_75 <= 0.1019 mse = 0.2117 samples = 23 value = -0.6957 300->301 312 pca_comp_75 <= 0.1062 mse = 0.0826 samples = 11 value = -0.9091 300->312 302 mse = 0.0 samples = 8 value = -1.0 301->302 303 pca_comp_78 <= 0.0339 mse = 0.2489 samples = 15 value = -0.5333 301->303 304 pca_comp_72 <= -0.1823 mse = 0.21 samples = 10 value = -0.3 303->304 311 mse = 0.0 samples = 5 value = -1.0 303->311 305 mse = 0.0 samples = 3 value = 0.0 304->305 306 pca_comp_76 <= 0.1207 mse = 0.2449 samples = 7 value = -0.4286 304->306 307 pca_comp_75 <= 0.1029 mse = 0.1875 samples = 4 value = -0.75 306->307 310 mse = 0.0 samples = 3 value = 0.0 306->310 308 mse = 0.0 samples = 1 value = 0.0 307->308 309 mse = 0.0 samples = 3 value = -1.0 307->309 313 pca_comp_76 <= 0.1211 mse = 0.1389 samples = 6 value = -0.8333 312->313 318 mse = 0.0 samples = 5 value = -1.0 312->318 314 mse = 0.0 samples = 3 value = -1.0 313->314 315 pca_comp_76 <= 0.1216 mse = 0.2222 samples = 3 value = -0.6667 313->315 316 mse = 0.0 samples = 1 value = 0.0 315->316 317 mse = 0.0 samples = 2 value = -1.0 315->317 321 pca_comp_73 <= -0.0675 mse = 0.1512 samples = 280 value = -0.1857 320->321 462 pca_comp_72 <= -0.1921 mse = 0.2222 samples = 9 value = -0.3333 320->462 322 mse = 0.0 samples = 4 value = 0.0 321->322 323 pca_comp_75 <= 0.099 mse = 0.1529 samples = 276 value = -0.1884 321->323 324 pca_comp_75 <= 0.0952 mse = 0.1789 samples = 150 value = -0.2333 323->324 419 pca_comp_72 <= -0.1955 mse = 0.1167 samples = 126 value = -0.1349 323->419 325 pca_comp_76 <= 0.1136 mse = 0.1224 samples = 28 value = -0.1429 324->325 334 pca_comp_72 <= -0.1924 mse = 0.1895 samples = 122 value = -0.2541 324->334 326 pca_comp_72 <= -0.1993 mse = 0.0434 samples = 22 value = -0.0455 325->326 329 pca_comp_72 <= -0.1986 mse = 0.25 samples = 6 value = -0.5 325->329 327 mse = 0.0 samples = 21 value = 0.0 326->327 328 mse = 0.0 samples = 1 value = -1.0 326->328 330 mse = 0.0 samples = 2 value = 0.0 329->330 331 pca_comp_73 <= -0.0421 mse = 0.1875 samples = 4 value = -0.75 329->331 332 mse = 0.0 samples = 1 value = 0.0 331->332 333 mse = 0.0 samples = 3 value = -1.0 331->333 335 pca_comp_75 <= 0.0964 mse = 0.1444 samples = 80 value = -0.175 334->335 384 pca_comp_72 <= -0.1894 mse = 0.2409 samples = 42 value = -0.4048 334->384 336 pca_comp_76 <= 0.1144 mse = 0.25 samples = 14 value = -0.5 335->336 345 pca_comp_78 <= 0.0355 mse = 0.0948 samples = 66 value = -0.1061 335->345 337 mse = 0.0 samples = 6 value = -1.0 336->337 338 pca_comp_75 <= 0.0959 mse = 0.1094 samples = 8 value = -0.125 336->338 339 pca_comp_76 <= 0.1156 mse = 0.16 samples = 5 value = -0.2 338->339 344 mse = 0.0 samples = 3 value = 0.0 338->344 340 pca_comp_75 <= 0.0953 mse = 0.25 samples = 2 value = -0.5 339->340 343 mse = 0.0 samples = 3 value = 0.0 339->343 341 mse = 0.0 samples = 1 value = 0.0 340->341 342 mse = 0.0 samples = 1 value = -1.0 340->342 346 pca_comp_75 <= 0.0985 mse = 0.1437 samples = 23 value = -0.1739 345->346 357 pca_comp_72 <= -0.1944 mse = 0.0649 samples = 43 value = -0.0698 345->357 347 pca_comp_73 <= -0.0511 mse = 0.1875 samples = 16 value = -0.25 346->347 356 mse = 0.0 samples = 7 value = 0.0 346->356 348 mse = 0.0 samples = 10 value = 0.0 347->348 349 pca_comp_76 <= 0.117 mse = 0.2222 samples = 6 value = -0.6667 347->349 350 mse = 0.0 samples = 1 value = 0.0 349->350 351 pca_comp_73 <= -0.048 mse = 0.16 samples = 5 value = -0.8 349->351 352 mse = 0.0 samples = 2 value = -1.0 351->352 353 pca_comp_73 <= -0.0471 mse = 0.2222 samples = 3 value = -0.6667 351->353 354 mse = 0.0 samples = 1 value = 0.0 353->354 355 mse = 0.0 samples = 2 value = -1.0 353->355 358 pca_comp_76 <= 0.1177 mse = 0.0384 samples = 25 value = -0.04 357->358 371 pca_comp_75 <= 0.0978 mse = 0.0988 samples = 18 value = -0.1111 357->371 359 pca_comp_72 <= -0.1966 mse = 0.0663 samples = 14 value = -0.0714 358->359 370 mse = 0.0 samples = 11 value = 0.0 358->370 360 pca_comp_75 <= 0.0969 mse = 0.1094 samples = 8 value = -0.125 359->360 369 mse = 0.0 samples = 6 value = 0.0 359->369 361 mse = 0.0 samples = 2 value = 0.0 360->361 362 pca_comp_75 <= 0.0978 mse = 0.1389 samples = 6 value = -0.1667 360->362 363 mse = 0.0 samples = 2 value = 0.0 362->363 364 pca_comp_76 <= 0.1174 mse = 0.1875 samples = 4 value = -0.25 362->364 365 pca_comp_75 <= 0.0981 mse = 0.25 samples = 2 value = -0.5 364->365 368 mse = 0.0 samples = 2 value = 0.0 364->368 366 mse = 0.0 samples = 1 value = 0.0 365->366 367 mse = 0.0 samples = 1 value = -1.0 365->367 372 mse = 0.0 samples = 6 value = 0.0 371->372 373 pca_comp_76 <= 0.1162 mse = 0.1389 samples = 12 value = -0.1667 371->373 374 pca_comp_72 <= -0.1927 mse = 0.2222 samples = 3 value = -0.3333 373->374 377 pca_comp_76 <= 0.1183 mse = 0.0988 samples = 9 value = -0.1111 373->377 375 mse = 0.0 samples = 2 value = 0.0 374->375 376 mse = 0.0 samples = 1 value = -1.0 374->376 378 mse = 0.0 samples = 4 value = 0.0 377->378 379 pca_comp_72 <= -0.1931 mse = 0.16 samples = 5 value = -0.2 377->379 380 mse = 0.0 samples = 2 value = 0.0 379->380 381 pca_comp_76 <= 0.1189 mse = 0.2222 samples = 3 value = -0.3333 379->381 382 mse = 0.0 samples = 2 value = 0.0 381->382 383 mse = 0.0 samples = 1 value = -1.0 381->383 385 pca_comp_72 <= -0.1916 mse = 0.2256 samples = 32 value = -0.3438 384->385 408 pca_comp_73 <= -0.0613 mse = 0.24 samples = 10 value = -0.6 384->408 386 mse = 0.0 samples = 7 value = 0.0 385->386 387 pca_comp_72 <= -0.1909 mse = 0.2464 samples = 25 value = -0.44 385->387 388 pca_comp_73 <= -0.0486 mse = 0.1224 samples = 7 value = -0.1429 387->388 391 pca_comp_76 <= 0.1185 mse = 0.2469 samples = 18 value = -0.5556 387->391 389 mse = 0.0 samples = 6 value = 0.0 388->389 390 mse = 0.0 samples = 1 value = -1.0 388->390 392 pca_comp_73 <= -0.0462 mse = 0.2222 samples = 12 value = -0.6667 391->392 403 pca_comp_76 <= 0.1191 mse = 0.2222 samples = 6 value = -0.3333 391->403 393 pca_comp_75 <= 0.0964 mse = 0.1983 samples = 11 value = -0.7273 392->393 402 mse = 0.0 samples = 1 value = 0.0 392->402 394 mse = 0.0 samples = 1 value = 0.0 393->394 395 pca_comp_73 <= -0.0487 mse = 0.16 samples = 10 value = -0.8 393->395 396 mse = 0.0 samples = 5 value = -1.0 395->396 397 pca_comp_75 <= 0.0976 mse = 0.24 samples = 5 value = -0.6 395->397 398 mse = 0.0 samples = 2 value = -1.0 397->398 399 pca_comp_73 <= -0.0475 mse = 0.2222 samples = 3 value = -0.3333 397->399 400 mse = 0.0 samples = 1 value = -1.0 399->400 401 mse = 0.0 samples = 2 value = 0.0 399->401 404 mse = 0.0 samples = 3 value = 0.0 403->404 405 pca_comp_73 <= -0.0575 mse = 0.2222 samples = 3 value = -0.6667 403->405 406 mse = 0.0 samples = 1 value = 0.0 405->406 407 mse = 0.0 samples = 2 value = -1.0 405->407 409 mse = 0.0 samples = 1 value = 0.0 408->409 410 pca_comp_75 <= 0.0988 mse = 0.2222 samples = 9 value = -0.6667 408->410 411 pca_comp_76 <= 0.1169 mse = 0.1875 samples = 8 value = -0.75 410->411 418 mse = 0.0 samples = 1 value = 0.0 410->418 412 mse = 0.0 samples = 3 value = -1.0 411->412 413 pca_comp_72 <= -0.1887 mse = 0.24 samples = 5 value = -0.6 411->413 414 pca_comp_72 <= -0.1891 mse = 0.2222 samples = 3 value = -0.3333 413->414 417 mse = 0.0 samples = 2 value = -1.0 413->417 415 mse = 0.0 samples = 2 value = 0.0 414->415 416 mse = 0.0 samples = 1 value = -1.0 414->416 420 mse = 0.0 samples = 67 value = 0.0 419->420 421 pca_comp_72 <= -0.1878 mse = 0.2051 samples = 59 value = -0.2881 419->421 422 pca_comp_72 <= -0.1908 mse = 0.1412 samples = 47 value = -0.1702 421->422 453 pca_comp_75 <= 0.0996 mse = 0.1875 samples = 12 value = -0.75 421->453 423 pca_comp_72 <= -0.1941 mse = 0.085 samples = 32 value = -0.0938 422->423 440 pca_comp_72 <= -0.1895 mse = 0.2222 samples = 15 value = -0.3333 422->440 424 mse = 0.0 samples = 8 value = 0.0 423->424 425 pca_comp_75 <= 0.0999 mse = 0.1094 samples = 24 value = -0.125 423->425 426 mse = 0.0 samples = 9 value = 0.0 425->426 427 pca_comp_72 <= -0.1923 mse = 0.16 samples = 15 value = -0.2 425->427 428 pca_comp_78 <= 0.0363 mse = 0.1094 samples = 8 value = -0.125 427->428 433 pca_comp_76 <= 0.1177 mse = 0.2041 samples = 7 value = -0.2857 427->433 429 mse = 0.0 samples = 5 value = 0.0 428->429 430 pca_comp_76 <= 0.1155 mse = 0.2222 samples = 3 value = -0.3333 428->430 431 mse = 0.0 samples = 2 value = 0.0 430->431 432 mse = 0.0 samples = 1 value = -1.0 430->432 434 mse = 0.0 samples = 1 value = -1.0 433->434 435 pca_comp_76 <= 0.1184 mse = 0.1389 samples = 6 value = -0.1667 433->435 436 pca_comp_73 <= -0.0497 mse = 0.25 samples = 2 value = -0.5 435->436 439 mse = 0.0 samples = 4 value = 0.0 435->439 437 mse = 0.0 samples = 1 value = -1.0 436->437 438 mse = 0.0 samples = 1 value = 0.0 436->438 441 pca_comp_78 <= 0.0365 mse = 0.0988 samples = 9 value = -0.1111 440->441 446 pca_comp_76 <= 0.1173 mse = 0.2222 samples = 6 value = -0.6667 440->446 442 mse = 0.0 samples = 7 value = 0.0 441->442 443 pca_comp_73 <= -0.0483 mse = 0.25 samples = 2 value = -0.5 441->443 444 mse = 0.0 samples = 1 value = 0.0 443->444 445 mse = 0.0 samples = 1 value = -1.0 443->445 447 mse = 0.0 samples = 2 value = -1.0 446->447 448 pca_comp_76 <= 0.1189 mse = 0.25 samples = 4 value = -0.5 446->448 449 pca_comp_73 <= -0.0572 mse = 0.2222 samples = 3 value = -0.6667 448->449 452 mse = 0.0 samples = 1 value = 0.0 448->452 450 mse = 0.0 samples = 1 value = 0.0 449->450 451 mse = 0.0 samples = 2 value = -1.0 449->451 454 mse = 0.0 samples = 4 value = -1.0 453->454 455 pca_comp_72 <= -0.1863 mse = 0.2344 samples = 8 value = -0.625 453->455 456 pca_comp_72 <= -0.1871 mse = 0.1389 samples = 6 value = -0.8333 455->456 461 mse = 0.0 samples = 2 value = 0.0 455->461 457 pca_comp_76 <= 0.1184 mse = 0.2222 samples = 3 value = -0.6667 456->457 460 mse = 0.0 samples = 3 value = -1.0 456->460 458 mse = 0.0 samples = 2 value = -1.0 457->458 459 mse = 0.0 samples = 1 value = 0.0 457->459 463 mse = 0.0 samples = 4 value = 0.0 462->463 464 pca_comp_73 <= -0.0265 mse = 0.24 samples = 5 value = -0.6 462->464 465 pca_comp_73 <= -0.0573 mse = 0.1875 samples = 4 value = -0.75 464->465 468 mse = 0.0 samples = 1 value = 0.0 464->468 466 mse = 0.0 samples = 1 value = 0.0 465->466 467 mse = 0.0 samples = 3 value = -1.0 465->467 471 pca_comp_78 <= 0.0255 mse = 0.2284 samples = 17 value = -0.6471 470->471 494 pca_comp_72 <= -0.429 mse = 0.2344 samples = 16 value = -0.375 470->494 472 mse = 0.0 samples = 1 value = 0.0 471->472 473 pca_comp_73 <= -0.2008 mse = 0.2148 samples = 16 value = -0.6875 471->473 474 pca_comp_72 <= -0.4008 mse = 0.2222 samples = 15 value = -0.6667 473->474 493 mse = 0.0 samples = 1 value = -1.0 473->493 475 pca_comp_75 <= 0.1663 mse = 0.2367 samples = 13 value = -0.6154 474->475 492 mse = 0.0 samples = 2 value = -1.0 474->492 476 pca_comp_78 <= 0.0432 mse = 0.2222 samples = 12 value = -0.6667 475->476 491 mse = 0.0 samples = 1 value = 0.0 475->491 477 pca_comp_76 <= -0.0097 mse = 0.1875 samples = 4 value = -0.75 476->477 482 pca_comp_73 <= -0.2134 mse = 0.2344 samples = 8 value = -0.625 476->482 478 mse = 0.0 samples = 2 value = -1.0 477->478 479 pca_comp_73 <= -0.2075 mse = 0.25 samples = 2 value = -0.5 477->479 480 mse = 0.0 samples = 1 value = -1.0 479->480 481 mse = 0.0 samples = 1 value = 0.0 479->481 483 mse = 0.0 samples = 2 value = -1.0 482->483 484 pca_comp_76 <= -0.0378 mse = 0.25 samples = 6 value = -0.5 482->484 485 pca_comp_78 <= 0.0472 mse = 0.1875 samples = 4 value = -0.75 484->485 490 mse = 0.0 samples = 2 value = 0.0 484->490 486 pca_comp_73 <= -0.2022 mse = 0.25 samples = 2 value = -0.5 485->486 489 mse = 0.0 samples = 2 value = -1.0 485->489 487 mse = 0.0 samples = 1 value = 0.0 486->487 488 mse = 0.0 samples = 1 value = -1.0 486->488 495 mse = 0.0 samples = 3 value = 0.0 494->495 496 pca_comp_78 <= 0.0484 mse = 0.2485 samples = 13 value = -0.4615 494->496 497 pca_comp_76 <= -0.0405 mse = 0.1875 samples = 4 value = -0.75 496->497 502 pca_comp_78 <= 0.0488 mse = 0.2222 samples = 9 value = -0.3333 496->502 498 mse = 0.0 samples = 2 value = -1.0 497->498 499 pca_comp_73 <= -0.1942 mse = 0.25 samples = 2 value = -0.5 497->499 500 mse = 0.0 samples = 1 value = -1.0 499->500 501 mse = 0.0 samples = 1 value = 0.0 499->501 503 pca_comp_78 <= 0.0487 mse = 0.25 samples = 6 value = -0.5 502->503 508 mse = 0.0 samples = 3 value = 0.0 502->508 504 pca_comp_75 <= 0.1585 mse = 0.1875 samples = 4 value = -0.25 503->504 507 mse = 0.0 samples = 2 value = -1.0 503->507 505 mse = 0.0 samples = 3 value = 0.0 504->505 506 mse = 0.0 samples = 1 value = -1.0 504->506 510 mse = 0.0 samples = 1070 value = -1.0 509->510 511 mse = 0.0 samples = 2 value = 0.0 509->511 513 mse = 0.0 samples = 11922 value = 0.0 512->513 514 VIN_fracReturned1DivReturnedNotNull <= 0.8321 mse = 0.0355 samples = 3129 value = 0.9821 512->514 515 pca_comp_75 <= -0.4135 mse = 0.6522 samples = 39 value = 0.5897 514->515 518 pca_comp_75 <= -0.5156 mse = 0.0257 samples = 3090 value = 0.9871 514->518 516 mse = 0.0 samples = 8 value = -1.0 515->516 517 mse = 0.0 samples = 31 value = 1.0 515->517 519 mse = 0.0 samples = 20 value = -1.0 518->519 520 mse = 0.0 samples = 3070 value = 1.0 518->520
In [58]:
# data raw, data informative, data pca-kmeans, data feature importance
df_backup.shape, df_features.shape, df_features2.shape, df_features3.shape, df_features4.shape
Out[58]:
((278337, 25), (278337, 89), (278337, 184), (278337, 59), (278337, 10))

Test pipeline model

In [68]:
df['RiskyDealerScore'] = demo.app_predict.predict.create_pipeline_model_new_data(df=df, path_data_dir=path_data_dir, show_plots=False)
################################################################################
Features:
['Arbitrated', 'Autocheck_score', 'BuyerID_fracDSEligible1DivTransactions', 'BuyerID_fracReturned1DivReturnedNotNull', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracReturnedasm1DivTransactions', 'BuyerID_numDSEligible1', 'BuyerID_numReturned1', 'BuyerID_numReturnedNotNull', 'BuyerID_numReturnedasm1', 'BuyerID_numTransactions', 'CarMake_fracDSEligible1DivTransactions', 'CarMake_fracReturned1DivReturnedNotNull', 'CarMake_fracReturnedNotNullDivDSEligible1', 'CarMake_fracReturnedasm1DivTransactions', 'CarMake_numDSEligible1', 'CarMake_numReturned1', 'CarMake_numReturnedNotNull', 'CarMake_numReturnedasm1', 'CarMake_numTransactions', 'CarYear', 'ConditionReport', 'DSEligible', 'InLane', 'JDPowersCat_COMPACTCAR', 'JDPowersCat_EXCLUDED', 'JDPowersCat_FULLSIZECAR', 'JDPowersCat_LUXURYCAR', 'JDPowersCat_MIDSIZECAR', 'JDPowersCat_PICKUP', 'JDPowersCat_SPORTSCAR', 'JDPowersCat_SUV', 'JDPowersCat_UNKNOWN', 'JDPowersCat_VAN', 'JDPowersCat_fracDSEligible1DivTransactions', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'JDPowersCat_fracReturnedasm1DivTransactions', 'JDPowersCat_numDSEligible1', 'JDPowersCat_numReturned1', 'JDPowersCat_numReturnedNotNull', 'JDPowersCat_numReturnedasm1', 'JDPowersCat_numTransactions', 'LIGHTG', 'LIGHTR', 'LIGHTY', 'LIGHT_N0G1Y2R3', 'MMR', 'Mileage', 'OVE', 'PSI', 'PSIEligible', 'Returned_asm', 'SaleDate_day', 'SaleDate_decyear', 'SaleDate_dow', 'SaleDate_doy', 'SalePrice', 'Salvage', 'SellerID_fracDSEligible1DivTransactions', 'SellerID_fracReturned1DivReturnedNotNull', 'SellerID_fracReturnedNotNullDivDSEligible1', 'SellerID_fracReturnedasm1DivTransactions', 'SellerID_numDSEligible1', 'SellerID_numReturned1', 'SellerID_numReturnedNotNull', 'SellerID_numReturnedasm1', 'SellerID_numTransactions', 'SellingLocation_fracDSEligible1DivTransactions', 'SellingLocation_fracReturned1DivReturnedNotNull', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_lat', 'SellingLocation_lon', 'SellingLocation_numDSEligible1', 'SellingLocation_numReturned1', 'SellingLocation_numReturnedNotNull', 'SellingLocation_numReturnedasm1', 'SellingLocation_numTransactions', 'Simulcast', 'VIN_fracDSEligible1DivTransactions', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'VIN_fracReturnedasm1DivTransactions', 'VIN_numDSEligible1', 'VIN_numReturned1', 'VIN_numReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_numTransactions']

################################################################################
`Container`: Create an empty container class and
dynamically allocate attributes to hold variables for specific steps
of the pipeline. 
`step.s0.[df,ds]_[features,target]`: Save initial state of features, target.

################################################################################
`transformer_scaler`, `transformer_pca`: Load existing transformers.

Time elapsed (sec) = 0.5

################################################################################
`transformer_kmeans`, `transformer_kmeans_pca`:
Predict centroid clusters.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
    
Time elapsed (sec) = 0.5

################################################################################
`df_features2`: Combine `df_features` with
cluster labels, cluster distances, PCA components, PCA cluster labels,
and PCA cluster distances into `df_features`.
Time elapsed (sec) = 1.3

################################################################################
Predict target values,
    plot actual vs predicted and score.
    
`features.pkl`, `estimator.pkl`: Save features and estimator.
    
Model score = 0.999
Time elapsed (sec) = 2.5

In [73]:
demo.app_predict.predict.plot_model(df=df, path_plot_dir=path_plot_dir)
In [13]:
buyerid = '18584'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df.loc[df['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_before.png'),
    dpi=300)
print(df.loc[df['BuyerID']==buyerid, col].mean(),
      df.loc[df['BuyerID']==buyerid, col].std())
buyerid: 18584
0.010752688172 0.697402495844
In [14]:
buyerid = '18584'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df_modl.loc[df_modl['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_after.png'),
    dpi=300)
print(df_modl.loc[df_modl['BuyerID']==buyerid, col].mean(),
      df_modl.loc[df_modl['BuyerID']==buyerid, col].std())
buyerid: 18584
-0.210810810811 0.662580505274
In [15]:
buyerid = '272356'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df.loc[df['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_before.png'),
    dpi=300)
print(df.loc[df['BuyerID']==buyerid, col].mean(),
      df.loc[df['BuyerID']==buyerid, col].std())
buyerid: 272356
-0.495284552846 0.561356298
In [16]:
buyerid = '272356'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df_modl.loc[df_modl['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_after.png'),
    dpi=300)
print(df_modl.loc[df_modl['BuyerID']==buyerid, col].mean(),
      df_modl.loc[df_modl['BuyerID']==buyerid, col].std())
buyerid: 272356
-0.519349593496 0.539162070427
In [17]:
buyerid = '328701'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df.loc[df['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_before.png'),
    dpi=300)
print(df.loc[df['BuyerID']==buyerid, col].mean(),
      df.loc[df['BuyerID']==buyerid, col].std())
buyerid: 328701
0.329966329966 0.787708144229
In [18]:
buyerid = '328701'
col = 'Returned'
print('buyerid:', buyerid)
sns.distplot(df_modl.loc[df_modl['BuyerID']==buyerid, col], kde=False)
plt.title(col+' frequency distribution\nfor BuyerID='+buyerid)
plt.xlabel(col)
plt.ylabel('Number of transactions\nwith '+col+' = X')
plt.savefig(
    os.path.join(path_data_dir, 'plot_model', 'model_returned-freq-dist_for_'+buyerid+'_after.png'),
    dpi=300)
print(df_modl.loc[df_modl['BuyerID']==buyerid, col].mean(),
      df_modl.loc[df_modl['BuyerID']==buyerid, col].std())
buyerid: 328701
-0.3367003367 0.784843688883
In [ ]:
 
In [ ]:
 
In [7]:
# For by-buyers
# remember_buyers = True
date_freq = 'W'

date_range = pd.date_range(start=df['SaleDate'].min(), end=df['SaleDate'].max(), freq=date_freq)
df_modl = df.loc[
    np.logical_and(
        date_range[0] <= df['SaleDate'],
        df['SaleDate'] < date_range[1])].copy()
df_orig = df.loc[
    np.logical_and(
        date_range[0] <= df['SaleDate'],
        df['SaleDate'] < date_range[1])].copy()
buyers = np.asarray([])
buyers_prohibited = dict() # key: (df_train['SaleDate'].min(), df_train['SaleDate'].max())
transactions_affected = dict() # key: (df_test['SaleDate'].min(), df_test['SaleDate'].max())
retrates_modl_chunk = dict() # key: (df_test['SaleDate'].min(), df_test['SaleDate'].max())
retrates_orig_chunk = dict() # key: (df_eval['SaleDate'].min(), df_eval['SaleDate'].max())
retrates_modl_all = dict() # key: (df_modl['SaleDate'].min(), df_modl['SaleDate'].max())
retrates_orig_all = dict() # key: (df_orig['SaleDate'].min(), df_orig['SaleDate'].max())

for idx in range(len(date_range)-2):
    print('#'*40)
    print('Timestamp:', time.strftime(r'%Y-%m-%dT%H:%M:%S%Z', time.gmtime()))
    
    # Define data sets.
    (saledate_train_min, saledate_train_max) = (date_range[idx],   date_range[idx+1])
    (saledate_test_min,  saledate_test_max)  = (date_range[idx+1], date_range[idx+2])
    (saledate_eval_min,  saledate_eval_max)  = (date_range[idx+1], date_range[idx+2])
    df_train = df_modl.loc[
        np.logical_and(
            saledate_train_min <= df_modl['SaleDate'],
            df_modl['SaleDate'] < saledate_train_max)].copy()
    df_test = df.loc[
        np.logical_and(
            saledate_test_min <= df['SaleDate'],
            df['SaleDate'] < saledate_test_max)].copy()
    df_eval = df.loc[
        np.logical_and(
            saledate_eval_min <= df['SaleDate'],
            df['SaleDate'] < saledate_eval_max)].copy()
    
    # Train model on old data.
    demo.app_predict.predict.create_pipeline_model(
        df=df_train, path_data_dir=path_data_dir, show_plots=False)
    # Use existing model to predict RiskyDealerScore for new data.
    # RiskyDealerScore is predictions of Returned.
    ds_rds = demo.app_predict.predict.create_pipeline_model_new_data(
        df=df_test, path_data_dir=path_data_dir, show_plots=False)    

    #     # For by-buyer
    #     # Prohibit purchase of DealShield in test data if BuyerID has predicted mean(RiskyDealerScore) > 0:
    #     # If prohibiting purchase of DealShield,
    #     # then for prohibed buyers, set DSEligible = 0 and Returned = -1 in the test data.
    #     # Do not remember buyers_prohibited.
    #     df_tmp = df_train.append(df_test)
    #     df_tmp.loc[ds_rds.index, 'Returned'] = ds_rds
    #     buyer_rds = df_tmp[['BuyerID', 'Returned']].groupby(by='BuyerID').mean()
    #     if remember_buyers:
    #         buyers = np.unique(
    #             np.append(
    #                 buyers,
    #                 buyer_rds.loc[buyer_rds['Returned'] > 0].index.values))
    #     else:
    #         buyers = buyer_rds.loc[buyer_rds['Returned'] > 0].index.values
    #     buyers_prohibited[(df_train['SaleDate'].min(), df_train['SaleDate'].max())] = buyers
    #     tfmask = np.logical_and(
    #         np.logical_and(
    #             df_test['SaleDate'] > df_train['SaleDate'].max(),
    #             df_test['BuyerID'].isin(buyers)),
    #         df_test['DSEligible'] == 1)    
    # For by-transaction
    # Prohibit purchase of DealShield in test data if transaction has predicted RiskyDealerScore > 0:
    # If prohibiting purchase of DealShield,
    # Then for prohibited transactions, set DSEligible = 0 and Returned = -1 in the test data.
    tfmask = ds_rds > 0
    transactions_affected[(df_test['SaleDate'].min(), df_test['SaleDate'].max())] = tfmask.loc[tfmask].index
    df_test.loc[tfmask, 'DSEligible'] = 0
    df_test.loc[tfmask, 'Returned'] = -1

    # Update calculated features.
    df_modl = demo.app_predict.predict.update_features_append(
        df_prev=df_modl, df_next=df_test, debug=False)
    df_orig = df_orig.append(df_eval)
    
    # Calculate chunk return rates.
    df_chunk = df_modl.loc[
        np.logical_and(
            saledate_test_min <= df_modl['SaleDate'],
            df_modl['SaleDate'] < saledate_test_max)]
    retrate_modl = sum(df_chunk['Returned']==1)/sum(df_chunk['Returned']!=-1)
    df_chunk = df_orig.loc[
        np.logical_and(
            saledate_eval_min <= df_orig['SaleDate'],
            df_orig['SaleDate'] < saledate_eval_max)]
    retrate_orig = sum(df_chunk['Returned']==1)/sum(df_chunk['Returned']!=-1)
    retrates_modl_chunk[(saledate_test_min, saledate_test_max)] = retrate_modl
    retrates_orig_chunk[(saledate_eval_min, saledate_eval_max)] = retrate_orig
    print('Chunk:')
    print('Evaluation time span:', df_chunk['SaleDate'].min(), df_chunk['SaleDate'].max())
    print('Model return rate:   ', retrate_modl)
    print('Original return rate:', retrate_orig)

    # Calculate overall return rates.
    retrate_modl = sum(df_modl['Returned']==1)/sum(df_modl['Returned']!=-1)
    retrate_orig = sum(df_orig['Returned']==1)/sum(df_orig['Returned']!=-1)
    retrates_modl_all[(df_modl['SaleDate'].min(), df_modl['SaleDate'].max())] = retrate_modl
    retrates_orig_all[(df_orig['SaleDate'].min(), df_orig['SaleDate'].max())] = retrate_orig
    print('Overall:')
    print('Evaluation time span:', df_orig['SaleDate'].min(), df_orig['SaleDate'].max())
    print('Model return rate:   ', retrate_modl)
    print('Original return rate:', retrate_orig)
    gc.collect()
########################################
Timestamp: 2017-03-02T10:06:28GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-02-10 00:00:00 2013-02-16 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-02-16 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-03-02T10:06:31GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-02-17 00:00:00 2013-02-23 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-02-23 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-03-02T10:06:35GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-02-24 00:00:00 2013-03-02 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-02 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-03-02T10:06:38GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-03-03 00:00:00 2013-03-09 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-09 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-03-02T10:06:42GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-03-10 00:00:00 2013-03-16 00:00:00
Model return rate:    nan
Original return rate: nan
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-16 00:00:00
Model return rate:    nan
Original return rate: nan
########################################
Timestamp: 2017-03-02T10:06:45GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
[]
Summarize top 5 important features:
important_features2 is empty
Important features list is empty.
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = -0.001
Chunk:
Evaluation time span: 2013-03-17 00:00:00 2013-03-23 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-23 00:00:00
Model return rate:    0.0
Original return rate: 0.0
########################################
Timestamp: 2017-03-02T10:06:49GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'pca_comp_48', 'pca_comp_43']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             6305.000000                            6305.000000   
mean                 0.000952                               0.000952   
std                  0.030836                               0.030836   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

        pca_comp_48   pca_comp_43  
count  6.305000e+03  6.305000e+03  
mean   1.800997e-18  3.673597e-18  
std    3.204550e-02  9.135607e-02  
min   -5.603634e-01 -2.859878e-01  
25%   -1.794282e-03 -1.824042e-02  
50%    1.207841e-03 -5.065691e-03  
75%    4.113957e-03  7.370712e-03  
max    7.194844e-01  2.257620e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 1.000
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.874
Chunk:
Evaluation time span: 2013-03-25 00:00:00 2013-03-30 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-03-30 00:00:00
Model return rate:    0.0
Original return rate: 0.0
########################################
Timestamp: 2017-03-02T10:06:54GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'pca_comp_42', 'pca_comp_48', 'pca_comp_37', 'VIN_numDSEligible1', 'pca_comp_54', 'VIN_numTransactions', 'pca_comp_51', 'pca_comp_41', 'SellerID_fracReturnedNotNullDivDSEligible1', 'SellerID_numDSEligible1', 'SellerID_numReturnedNotNull', 'SellingLocation_numReturnedasm1', 'pca_comp_29', 'pca_comp_03', 'pca_comp_11', 'pca_comp_53', 'pca_comp_30', 'pca_comp_35', 'pca_comp_36', 'pca_comp_38', 'pca_comp_49', 'BuyerID_numReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5667.000000             5667.000000   
mean                                0.001147                0.001235   
std                                 0.033193                0.035127   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                1.000000   

        pca_comp_42   pca_comp_48   pca_comp_37  
count  5.667000e+03  5.667000e+03  5.667000e+03  
mean   1.088771e-17 -8.494543e-20 -8.878405e-18  
std    1.068222e-01  3.595365e-02  1.933022e-01  
min   -2.719575e-01 -6.844153e-01 -7.311452e-01  
25%   -3.173419e-02 -1.569564e-03 -5.942529e-02  
50%   -8.505077e-03  1.746450e-03 -9.108867e-03  
75%    1.450669e-02  4.591372e-03  4.494162e-02  
max    3.082570e+00  9.875722e-01  2.116968e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.933
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.915
Chunk:
Evaluation time span: 2013-03-31 00:00:00 2013-04-06 00:00:00
Model return rate:    0.0263157894737
Original return rate: 0.0263157894737
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-06 00:00:00
Model return rate:    0.02
Original return rate: 0.02
########################################
Timestamp: 2017-03-02T10:06:59GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_60', 'BuyerID_numReturned1', 'pca_comp_56', 'SellerID_fracReturned1DivReturnedNotNull', 'SellerID_numReturned1', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_62', 'pca_comp_52', 'VIN_numReturned1', 'VIN_numReturnedasm1', 'pca_comp_29', 'pca_comp_53', 'pca_comp_61', 'CarMake_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'pca_comp_57', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_44']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             5606.000000                            5606.000000   
mean                 0.006778                               0.006422   
std                  0.082059                               0.078760   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       Returned_asm  BuyerID_fracReturned1DivReturnedNotNull   pca_comp_60  
count   5606.000000                              5606.000000  5.606000e+03  
mean       0.052265                                 0.000178  4.017610e-17  
std        0.222582                                 0.009443  1.193985e-02  
min        0.000000                                 0.000000 -5.619663e-01  
25%        0.000000                                 0.000000 -1.621865e-03  
50%        0.000000                                 0.000000  6.944911e-06  
75%        0.000000                                 0.000000  1.739356e-03  
max        1.000000                                 0.500000  2.592061e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.975
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 1.000
Chunk:
Evaluation time span: 2013-04-07 00:00:00 2013-04-13 00:00:00
Model return rate:    0.0
Original return rate: 0.0
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-13 00:00:00
Model return rate:    0.0103092783505
Original return rate: 0.0103092783505
########################################
Timestamp: 2017-03-02T10:07:05GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1
count             6090.000000                            6090.000000
mean                 0.007718                               0.007635
std                  0.087517                               0.086818
min                  0.000000                               0.000000
25%                  0.000000                               0.000000
50%                  0.000000                               0.000000
75%                  0.000000                               0.000000
max                  1.000000                               1.000000
Progress: 20% 40% 60% 80% 100% 

Model score = 0.998
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.928
Chunk:
Evaluation time span: 2013-04-14 00:00:00 2013-04-19 00:00:00
Model return rate:    0.0535714285714
Original return rate: 0.0535714285714
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-19 00:00:00
Model return rate:    0.0261437908497
Original return rate: 0.0261437908497
########################################
Timestamp: 2017-03-02T10:07:10GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_fracReturnedasm1DivTransactions', 'SellerID_numReturned1', 'Returned_asm', 'VIN_numReturned1', 'VIN_numDSEligible1', 'pca_comp_40', 'VIN_numTransactions', 'pca_comp_42', 'pca_comp_57', 'pca_comp_62', 'LIGHTG', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_45', 'pca_comp_46', 'pca_comp_56', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_13', 'pca_comp_38', 'pca_comp_58', 'pca_comp_35', 'pca_comp_26', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_16', 'SalePrice', 'CarMake_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracDSEligible1DivTransactions', 'JDPowersCat_MIDSIZECAR', 'pca_comp_61', 'SellerID_numDSEligible1', 'SellerID_numTransactions', 'CarMake_fracReturnedasm1DivTransactions', 'CarYear', 'pca_comp_02', 'pca_comp_17', 'pca_comp_68', 'pca_comp_20', 'pca_comp_53', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_63', 'pca_comp_59', 'JDPowersCat_fracReturnedasm1DivTransactions', 'pca_comp_39', 'pca_comp_36', 'pca_comp_19', 'pca_comp_11', 'pca_comp_07', 'pca_comp_43', 'LIGHT_N0G1Y2R3', 'pca_comp_34', 'JDPowersCat_numReturnedasm1', 'CarMake_numDSEligible1', 'pca_comp_70', 'SellingLocation_fracReturnedasm1DivTransactions', 'pca_comp_44', 'pca_comp_51', 'pca_comp_08', 'BuyerID_fracDSEligible1DivTransactions']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5720.000000             5720.000000   
mean                                0.009965                0.010315   
std                                 0.098451                0.101045   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                1.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturnedasm1  \
count                          5720.000000          5720.000000   
mean                              0.000524             0.052797   
std                               0.022897             0.225981   
min                               0.000000             0.000000   
25%                               0.000000             0.000000   
50%                               0.000000             0.000000   
75%                               0.000000             0.000000   
max                               1.000000             2.000000   

       VIN_fracReturnedasm1DivTransactions  
count                          5720.000000  
mean                              0.051573  
std                               0.220392  
min                               0.000000  
25%                               0.000000  
50%                               0.000000  
75%                               0.000000  
max                               1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.968
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.932
Chunk:
Evaluation time span: 2013-04-21 00:00:00 2013-04-27 00:00:00
Model return rate:    0.0
Original return rate: 0.0576923076923
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-04-27 00:00:00
Model return rate:    0.02
Original return rate: 0.0341463414634
########################################
Timestamp: 2017-03-02T10:07:16GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'pca_comp_44', 'pca_comp_27', 'pca_comp_06', 'pca_comp_37', 'pca_comp_63', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'VIN_numDSEligible1', 'pca_comp_60', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_19', 'pca_comp_66', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_12', 'BuyerID_numReturnedNotNull', 'pca_comp_17', 'pca_comp_04', 'pca_comp_01', 'pca_comp_05', 'pca_comp_65', 'SalePrice', 'pca_comp_02', 'pca_comp_16', 'pca_comp_22', 'pca_comp_35', 'VIN_numTransactions', 'pca_comp_24', 'ConditionReport', 'pca_comp_00', 'pca_comp_36', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_34', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'pca_comp_49', 'pca_cluster_1_dist', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_47', 'pca_comp_31', 'pca_comp_30', 'pca_comp_64', 'SellingLocation_numReturnedasm1', 'MMR', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_28', 'JDPowersCat_numReturned1', 'CarMake_numDSEligible1', 'CarMake_numTransactions', 'pca_comp_11', 'CarMake_fracReturnedNotNullDivDSEligible1', 'Autocheck_score', 'JDPowersCat_fracDSEligible1DivTransactions', 'SellingLocation_numReturnedNotNull', 'pca_cluster_0_dist', 'pca_comp_68', 'pca_comp_20', 'pca_comp_71', 'pca_comp_38']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             6074.000000                            6074.000000   
mean                 0.008232                               0.007601   
std                  0.090363                               0.085236   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

        pca_comp_44   pca_comp_27   pca_comp_06  
count  6.074000e+03  6.074000e+03  6.074000e+03  
mean  -7.522428e-17  2.805488e-17 -2.402681e-17  
std    1.067775e-01  4.031969e-01  2.309402e+00  
min   -8.689928e-01 -2.956313e+00 -1.694126e+01  
25%   -2.626192e-02 -1.180146e-01 -3.569849e-01  
50%    2.941517e-04 -4.050110e-03  7.012193e-01  
75%    2.743399e-02  1.578955e-01  1.166400e+00  
max    1.281656e+00  1.778804e+00  1.337341e+01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.979
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.862
Chunk:
Evaluation time span: 2013-04-28 00:00:00 2013-05-04 00:00:00
Model return rate:    0.0816326530612
Original return rate: 0.0816326530612
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-04 00:00:00
Model return rate:    0.0321285140562
Original return rate: 0.0433070866142
########################################
Timestamp: 2017-03-02T10:07:22GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'VIN_numReturnedasm1', 'Returned_asm', 'VIN_numReturned1', 'VIN_numTransactions', 'JDPowersCat_COMPACTCAR', 'pca_comp_64', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'VIN_numDSEligible1', 'pca_comp_38', 'pca_comp_62', 'pca_comp_57', 'pca_comp_41', 'pca_comp_65', 'pca_comp_60', 'pca_comp_46', 'pca_comp_55', 'pca_comp_26', 'pca_comp_09', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_61', 'pca_comp_45', 'LIGHTG', 'pca_comp_39', 'pca_comp_28', 'pca_comp_10', 'pca_comp_25', 'pca_comp_50', 'SellerID_numReturnedNotNull', 'pca_comp_13', 'pca_comp_21', 'SellerID_numDSEligible1', 'pca_comp_48', 'pca_comp_40', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_17', 'pca_comp_53', 'Autocheck_score', 'SellingLocation_numTransactions', 'pca_comp_11', 'pca_comp_49', 'CarYear', 'pca_comp_52', 'pca_comp_58']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5565.000000             5565.000000   
mean                                0.009224                0.010063   
std                                 0.093496                0.099817   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                1.000000   

       VIN_fracReturned1DivReturnedNotNull  \
count                          5565.000000   
mean                              0.000719   
std                               0.026803   
min                               0.000000   
25%                               0.000000   
50%                               0.000000   
75%                               0.000000   
max                               1.000000   

       VIN_fracReturnedasm1DivTransactions  VIN_numReturnedasm1  
count                          5565.000000          5565.000000  
mean                              0.045103             0.045642  
std                               0.206899             0.208727  
min                               0.000000             0.000000  
25%                               0.000000             0.000000  
50%                               0.000000             0.000000  
75%                               0.000000             0.000000  
max                               1.000000             1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.988
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.944
Chunk:
Evaluation time span: 2013-05-05 00:00:00 2013-05-11 00:00:00
Model return rate:    0.0462962962963
Original return rate: 0.274647887324
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-11 00:00:00
Model return rate:    0.0364145658263
Original return rate: 0.126262626263
########################################
Timestamp: 2017-03-02T10:07:29GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'PSIEligible', 'Returned_asm', 'VIN_numReturnedasm1', 'VIN_fracReturnedasm1DivTransactions', 'VIN_numTransactions', 'pca_comp_62', 'cluster_1_dist', 'pca_cluster_0_dist', 'pca_comp_37', 'VIN_numDSEligible1', 'pca_comp_60', 'pca_comp_58', 'pca_comp_53', 'pca_comp_61', 'pca_comp_30', 'pca_comp_32', 'pca_comp_26', 'pca_comp_36', 'MMR', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_49', 'pca_comp_13', 'pca_comp_68', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_40', 'pca_comp_54', 'pca_comp_11', 'SalePrice', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_22', 'BuyerID_numReturnedNotNull', 'pca_comp_04', 'pca_comp_56', 'BuyerID_numReturned1', 'BuyerID_numDSEligible1', 'LIGHT_N0G1Y2R3', 'pca_comp_66', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_59', 'pca_comp_48', 'pca_comp_12', 'pca_cluster_1_dist']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6193.000000             6193.000000   
mean                                0.017116                0.018892   
std                                 0.126599                0.137337   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturned1  PSIEligible  
count                          6193.000000       6193.000000  6193.000000  
mean                              0.000807          0.000807     0.717261  
std                               0.028405          0.028405     0.450367  
min                               0.000000          0.000000     0.000000  
25%                               0.000000          0.000000     0.000000  
50%                               0.000000          0.000000     1.000000  
75%                               0.000000          0.000000     1.000000  
max                               1.000000          1.000000     1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.993
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.988
Chunk:
Evaluation time span: 2013-05-12 00:00:00 2013-05-18 00:00:00
Model return rate:    0.0
Original return rate: 0.269662921348
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-18 00:00:00
Model return rate:    0.0308056872038
Original return rate: 0.152577319588
########################################
Timestamp: 2017-03-02T10:07:35GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'pca_comp_39', 'pca_comp_59', 'VIN_numDSEligible1', 'VIN_numTransactions', 'SellingLocation_lat', 'SellingLocation_lon', 'SaleDate_decyear', 'SellingLocation_numReturned1', 'pca_comp_35', 'SellingLocation_numReturnedNotNull', 'pca_comp_34', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SaleDate_dow', 'SellingLocation_numReturnedasm1', 'SellingLocation_numTransactions', 'SellingLocation_fracDSEligible1DivTransactions', 'SaleDate_doy', 'LIGHTR', 'SellerID_numReturned1', 'pca_comp_26', 'LIGHT_N0G1Y2R3', 'pca_comp_41', 'cluster_0_dist', 'SellingLocation_fracReturnedasm1DivTransactions', 'MMR', 'PSIEligible', 'CarYear', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_14', 'pca_comp_62', 'pca_comp_03', 'pca_comp_11', 'BuyerID_numReturnedNotNull', 'Autocheck_score', 'pca_comp_60', 'Mileage', 'SaleDate_day', 'SalePrice', 'pca_comp_19', 'BuyerID_fracDSEligible1DivTransactions', 'SellingLocation_numDSEligible1', 'pca_comp_15']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5892.000000             5892.000000   
mean                                0.011301                0.012220   
std                                 0.103758                0.109876   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                1.000000   

        pca_comp_39   pca_comp_59  VIN_numDSEligible1  
count  5.892000e+03  5.892000e+03         5892.000000  
mean  -7.553054e-18  3.145844e-17            0.969280  
std    1.405811e-01  1.393887e-02            0.292334  
min   -7.222085e-01 -3.834046e-01            0.000000  
25%   -3.100796e-02 -1.797219e-03            1.000000  
50%   -7.999452e-03 -5.371319e-04            1.000000  
75%    1.288353e-02  8.922595e-04            1.000000  
max    1.658963e+00  9.674088e-02            4.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 1.000
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.850
Chunk:
Evaluation time span: 2013-05-19 00:00:00 2013-05-25 00:00:00
Model return rate:    0.185567010309
Original return rate: 0.185567010309
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-25 00:00:00
Model return rate:    0.0795454545455
Original return rate: 0.162002945508
########################################
Timestamp: 2017-03-02T10:07:41GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'pca_comp_62', 'VIN_numDSEligible1', 'pca_comp_63', 'pca_comp_35', 'VIN_numTransactions', 'pca_comp_66', 'SellerID_numReturned1', 'SellerID_fracReturned1DivReturnedNotNull', 'SellerID_numReturnedNotNull', 'SellingLocation_fracDSEligible1DivTransactions', 'SellingLocation_numReturnedNotNull', 'pca_comp_07', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numReturned1', 'pca_comp_14', 'pca_comp_18', 'pca_comp_12', 'pca_comp_31', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_01', 'pca_comp_55', 'pca_comp_40', 'cluster_0_dist', 'SellerID_fracReturnedNotNullDivDSEligible1', 'SellerID_fracDSEligible1DivTransactions', 'SaleDate_doy', 'JDPowersCat_numReturned1']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             5830.000000                            5830.000000   
mean                 0.034991                               0.033419   
std                  0.183774                               0.177597   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       Returned_asm   pca_comp_62  VIN_numDSEligible1  
count   5830.000000  5.830000e+03         5830.000000  
mean       0.054717  7.200739e-20            0.981990  
std        0.227446  2.133689e-02            0.282475  
min        0.000000 -4.286332e-01            0.000000  
25%        0.000000 -1.693213e-03            1.000000  
50%        0.000000 -6.199982e-04            1.000000  
75%        0.000000  5.975842e-04            1.000000  
max        1.000000  2.311126e-01            3.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 1.000
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.991
Chunk:
Evaluation time span: 2013-05-26 00:00:00 2013-05-31 00:00:00
Model return rate:    0.0
Original return rate: 0.298780487805
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-05-31 00:00:00
Model return rate:    0.0670314637483
Original return rate: 0.188612099644
########################################
Timestamp: 2017-03-02T10:07:48GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'pca_comp_58', 'VIN_numTransactions', 'pca_comp_57', 'pca_comp_35', 'VIN_numDSEligible1', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_37', 'pca_comp_52', 'pca_comp_47', 'pca_comp_53', 'pca_comp_38', 'pca_comp_48', 'pca_comp_49', 'BuyerID_numTransactions', 'BuyerID_numDSEligible1', 'pca_comp_17', 'pca_comp_01', 'pca_comp_04', 'pca_comp_16', 'pca_comp_54', 'pca_comp_65', 'pca_comp_55', 'pca_comp_03', 'pca_comp_09', 'VIN_numReturned1', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_08', 'pca_comp_50', 'pca_comp_61', 'pca_comp_68', 'CarMake_fracReturnedasm1DivTransactions', 'CarMake_fracDSEligible1DivTransactions']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             4901.000000                            4901.000000   
mean                 0.024281                               0.023295   
std                  0.153935                               0.149361   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

        pca_comp_58  VIN_numTransactions   pca_comp_57  
count  4.901000e+03          4901.000000  4.901000e+03  
mean  -2.741295e-17             1.032034 -3.025910e-16  
std    2.345620e-02             0.181811  2.492683e-02  
min   -5.172341e-01             1.000000 -2.294217e-01  
25%   -2.889731e-03             1.000000 -4.332962e-03  
50%   -7.620604e-04             1.000000 -7.799630e-04  
75%    1.693913e-03             1.000000  2.870260e-03  
max    1.769475e-01             3.000000  4.441703e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.981
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.743
Chunk:
Evaluation time span: 2013-06-02 00:00:00 2013-06-08 00:00:00
Model return rate:    0.30243902439
Original return rate: 0.30243902439
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-08 00:00:00
Model return rate:    0.11858974359
Original return rate: 0.210877862595
########################################
Timestamp: 2017-03-02T10:07:54GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_39', 'VIN_numDSEligible1', 'VIN_numTransactions', 'pca_comp_61', 'pca_comp_64', 'Mileage', 'pca_comp_60', 'BuyerID_numReturnedNotNull', 'SellingLocation_fracReturned1DivReturnedNotNull', 'pca_comp_16', 'pca_comp_67', 'CarMake_numReturned1', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_32', 'pca_comp_07', 'pca_comp_41', 'BuyerID_numDSEligible1', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_55', 'BuyerID_numTransactions', 'pca_comp_26', 'pca_comp_49', 'pca_comp_08', 'pca_comp_34', 'pca_comp_35', 'pca_comp_63', 'pca_comp_04', 'Arbitrated', 'pca_comp_30', 'Autocheck_score', 'pca_comp_70', 'pca_comp_13', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numReturnedNotNull', 'pca_comp_18', 'PSIEligible', 'pca_comp_21', 'pca_comp_23', 'pca_comp_20', 'JDPowersCat_fracDSEligible1DivTransactions', 'CarYear', 'CarMake_numReturnedasm1', 'CarMake_numReturnedNotNull', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_14', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_02', 'SellerID_fracReturnedNotNullDivDSEligible1', 'CarMake_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numDSEligible1', 'pca_comp_51', 'pca_comp_58', 'pca_comp_47', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_00', 'BuyerID_numReturnedasm1', 'pca_comp_43', 'pca_comp_59', 'SaleDate_dow', 'LIGHTR', 'pca_comp_33', 'SellerID_numDSEligible1', 'SellerID_fracReturnedasm1DivTransactions', 'SellingLocation_lat', 'SellingLocation_numReturned1', 'pca_comp_36', 'PSI', 'SaleDate_doy', 'pca_comp_48', 'SellingLocation_fracReturnedasm1DivTransactions', 'pca_cluster_0_dist', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_09', 'pca_comp_25', 'CarMake_numTransactions']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             6361.000000                            6361.000000   
mean                 0.033485                               0.032005   
std                  0.179914                               0.174046   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       Returned_asm  BuyerID_fracReturnedNotNullDivDSEligible1   pca_comp_39  
count   6361.000000                                6361.000000  6.361000e+03  
mean       0.065556                                   0.008813 -2.967111e-18  
std        0.247523                                   0.040236  2.261858e-01  
min        0.000000                                   0.000000 -1.114342e+00  
25%        0.000000                                   0.000000 -7.745039e-02  
50%        0.000000                                   0.000000 -1.436190e-02  
75%        0.000000                                   0.000000  4.729452e-02  
max        1.000000                                   1.000000  3.368160e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.997
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.982
Chunk:
Evaluation time span: 2013-06-10 00:00:00 2013-06-15 00:00:00
Model return rate:    0.0
Original return rate: 0.278195488722
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-15 00:00:00
Model return rate:    0.0984042553191
Original return rate: 0.224505327245
########################################
Timestamp: 2017-03-02T10:08:01GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numDSEligible1', 'BuyerID_numReturnedNotNull', 'pca_comp_35', 'pca_comp_37', 'pca_comp_05', 'pca_comp_36', 'VIN_fracReturned1DivReturnedNotNull', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_59', 'pca_comp_56', 'pca_comp_60', 'pca_comp_11', 'VIN_numReturned1', 'pca_comp_22', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_03', 'pca_comp_10', 'Autocheck_score', 'CarMake_fracDSEligible1DivTransactions', 'cluster_0_dist', 'pca_comp_15', 'VIN_numReturnedasm1', 'pca_comp_58', 'CarMake_fracReturnedasm1DivTransactions', 'pca_cluster_0_dist', 'pca_comp_41', 'pca_comp_31', 'pca_comp_54', 'Mileage', 'pca_comp_32', 'pca_comp_23', 'SellerID_numReturnedasm1', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_52', 'pca_comp_19', 'SellerID_fracReturned1DivReturnedNotNull', 'JDPowersCat_numReturnedNotNull', 'CarMake_numReturned1', 'pca_comp_65', 'pca_comp_33', 'CarMake_numReturnedNotNull', 'pca_comp_14', 'SellerID_numDSEligible1', 'pca_comp_27', 'pca_comp_43', 'SellerID_numTransactions', 'pca_comp_29', 'pca_comp_40', 'pca_comp_48', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_30', 'pca_comp_04', 'pca_comp_17', 'pca_comp_61', 'SellerID_fracReturnedasm1DivTransactions']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             6134.000000                            6134.000000   
mean                 0.033094                               0.030758   
std                  0.178897                               0.169364   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       VIN_numDSEligible1  BuyerID_numReturnedNotNull   pca_comp_35  
count         6134.000000                 6134.000000  6.134000e+03  
mean             0.956798                    0.988099 -2.352934e-19  
std              0.308463                    3.925321  2.369513e-01  
min              0.000000                    0.000000 -1.014847e+00  
25%              1.000000                    0.000000 -1.442965e-01  
50%              1.000000                    0.000000 -3.722976e-02  
75%              1.000000                    0.000000  1.401617e-01  
max              3.000000                   60.000000  1.651857e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.965
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.813
Chunk:
Evaluation time span: 2013-06-16 00:00:00 2013-06-22 00:00:00
Model return rate:    0.166153846154
Original return rate: 0.166153846154
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-22 00:00:00
Model return rate:    0.11355815554
Original return rate: 0.21293471629
########################################
Timestamp: 2017-03-02T10:08:07GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'pca_comp_66', 'VIN_fracReturnedasm1DivTransactions', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_62', 'pca_comp_64', 'pca_comp_46', 'pca_comp_10', 'pca_comp_68', 'pca_comp_50', 'pca_comp_36', 'pca_comp_65', 'pca_comp_42', 'Arbitrated', 'pca_cluster_0_dist', 'JDPowersCat_COMPACTCAR', 'BuyerID_numReturnedNotNull', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_00', 'pca_comp_61', 'SellerID_numReturnedasm1', 'cluster_1_dist', 'pca_comp_57', 'pca_cluster_1_dist', 'CarYear', 'pca_comp_11', 'pca_comp_59', 'cluster_0_dist', 'pca_comp_58', 'BuyerID_fracReturnedasm1DivTransactions', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_21', 'pca_comp_48', 'pca_comp_39', 'pca_comp_54', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_37', 'pca_comp_06', 'SellerID_numDSEligible1', 'SellingLocation_numReturned1', 'pca_comp_08', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'pca_comp_24', 'DSEligible', 'pca_comp_22', 'pca_comp_40', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_18', 'VIN_numReturned1', 'pca_comp_16', 'JDPowersCat_VAN', 'pca_comp_69', 'pca_comp_34', 'pca_comp_05', 'pca_comp_12', 'pca_comp_29', 'SellerID_numTransactions', 'JDPowersCat_numTransactions', 'JDPowersCat_fracReturnedasm1DivTransactions', 'JDPowersCat_SUV', 'pca_comp_56', 'pca_comp_47', 'BuyerID_numDSEligible1', 'SellingLocation_numDSEligible1', 'pca_comp_35', 'Mileage', 'pca_comp_03', 'pca_comp_72', 'pca_comp_27', 'pca_comp_20']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6020.000000             6020.000000   
mean                                0.054070                0.056146   
std                                 0.224052                0.230943   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm   pca_comp_66  VIN_fracReturnedasm1DivTransactions  
count   6020.000000  6.020000e+03                          6020.000000  
mean       0.067774  1.362857e-17                             0.068660  
std        0.251379  7.584248e-03                             0.251009  
min        0.000000 -2.202392e-01                             0.000000  
25%        0.000000 -8.749921e-04                             0.000000  
50%        0.000000 -2.065304e-04                             0.000000  
75%        0.000000  5.339752e-04                             0.000000  
max        1.000000  1.736245e-01                             1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.988
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.983
Chunk:
Evaluation time span: 2013-06-23 00:00:00 2013-06-29 00:00:00
Model return rate:    0.0
Original return rate: 0.203187250996
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-06-29 00:00:00
Model return rate:    0.0890447922288
Original return rate: 0.210649229332
########################################
Timestamp: 2017-03-02T10:08:14GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturned1', 'pca_comp_56', 'VIN_fracReturned1DivReturnedNotNull', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_36', 'pca_comp_61', 'VIN_numDSEligible1', 'BuyerID_numReturnedNotNull', 'pca_comp_44', 'cluster_1_dist', 'pca_comp_00', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_54', 'pca_comp_13', 'SellingLocation_fracReturned1DivReturnedNotNull', 'JDPowersCat_SPORTSCAR', 'pca_comp_07', 'pca_comp_60', 'pca_comp_32', 'BuyerID_numTransactions', 'BuyerID_numDSEligible1', 'pca_comp_43', 'pca_comp_11', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_21', 'SellerID_numReturned1', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_lat', 'pca_comp_67', 'VIN_numTransactions', 'pca_cluster_1_dist', 'pca_comp_52', 'pca_comp_47', 'pca_comp_48', 'pca_comp_01', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_50', 'BuyerID_numReturned1', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_49', 'JDPowersCat_numReturnedasm1', 'JDPowersCat_numReturnedNotNull', 'pca_comp_05', 'pca_comp_40', 'pca_comp_41', 'pca_comp_38', 'pca_comp_08', 'pca_comp_15', 'pca_comp_02', 'pca_comp_68', 'pca_comp_14', 'pca_comp_27', 'JDPowersCat_SUV', 'pca_comp_51', 'JDPowersCat_numReturned1', 'SellerID_numTransactions', 'pca_comp_20', 'JDPowersCat_fracDSEligible1DivTransactions', 'pca_comp_62', 'pca_comp_65', 'DSEligible', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_37', 'CarMake_numReturnedasm1', 'pca_cluster_0_dist', 'pca_comp_66', 'SellingLocation_numTransactions', 'BuyerID_fracReturnedasm1DivTransactions', 'JDPowersCat_numTransactions', 'pca_comp_69', 'CarYear', 'JDPowersCat_numDSEligible1', 'pca_comp_35', 'pca_comp_26', 'cluster_0_dist', 'SellerID_fracReturnedasm1DivTransactions', 'SellerID_numDSEligible1', 'pca_comp_55', 'LIGHT_N0G1Y2R3', 'pca_cluster', 'CarMake_fracReturned1DivReturnedNotNull', 'Returned_asm', 'MMR', 'CarMake_fracReturnedasm1DivTransactions', 'SellingLocation_numReturnedasm1', 'pca_comp_45', 'pca_comp_10', 'pca_comp_46', 'Autocheck_score', 'BuyerID_numReturnedasm1', 'JDPowersCat_fracReturnedasm1DivTransactions', 'pca_comp_59', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedasm1', 'pca_comp_33', 'pca_comp_22', 'pca_comp_31', 'LIGHTR', 'pca_comp_23', 'pca_comp_12', 'CarMake_numReturned1', 'JDPowersCat_PICKUP', 'pca_comp_25', 'SaleDate_decyear', 'Mileage', 'pca_comp_53']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             5937.000000                            5937.000000   
mean                 0.069564                               0.067065   
std                  0.254432                               0.247722   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       VIN_numReturned1   pca_comp_56  VIN_fracReturned1DivReturnedNotNull  
count       5937.000000  5.937000e+03                          5937.000000  
mean           0.000842 -1.329633e-17                             0.000842  
std            0.029010  4.357245e-02                             0.029010  
min            0.000000 -2.964535e-01                             0.000000  
25%            0.000000 -6.923517e-03                             0.000000  
50%            0.000000 -5.427751e-04                             0.000000  
75%            0.000000  7.572756e-03                             0.000000  
max            1.000000  1.344847e+00                             1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.990
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.479
Chunk:
Evaluation time span: 2013-06-30 00:00:00 2013-07-06 00:00:00
Model return rate:    0.220708446866
Original return rate: 0.220708446866
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-06 00:00:00
Model return rate:    0.110810810811
Original return rate: 0.212121212121
########################################
Timestamp: 2017-03-02T10:08:21GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_67', 'VIN_numDSEligible1', 'DSEligible', 'pca_comp_00', 'BuyerID_numReturned1', 'pca_comp_64', 'VIN_numTransactions', 'pca_comp_42', 'pca_comp_65', 'pca_comp_63', 'BuyerID_numReturnedNotNull', 'cluster_0_dist', 'pca_comp_49', 'pca_comp_45', 'pca_comp_37', 'pca_comp_72', 'cluster', 'pca_comp_58', 'CarYear', 'pca_cluster', 'pca_comp_01', 'pca_comp_39', 'SellerID_numDSEligible1', 'pca_comp_31', 'pca_comp_35', 'pca_comp_34', 'SellerID_numTransactions', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_cluster_1_dist', 'SellerID_fracReturnedasm1DivTransactions', 'pca_comp_36', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_20', 'pca_comp_10', 'cluster_1_dist']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  Returned_asm  \
count                            3750.000000   3750.000000   
mean                                0.099956      0.088000   
std                                 0.297239      0.283333   
min                                 0.000000      0.000000   
25%                                 0.000000      0.000000   
50%                                 0.000000      0.000000   
75%                                 0.000000      0.000000   
max                                 1.000000      1.000000   

       VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  \
count             3750.000000                          3750.000000   
mean                 0.103733                             0.025067   
std                  0.305828                             0.156348   
min                  0.000000                             0.000000   
25%                  0.000000                             0.000000   
50%                  0.000000                             0.000000   
75%                  0.000000                             0.000000   
max                  2.000000                             1.000000   

       VIN_numReturned1  
count       3750.000000  
mean           0.025333  
std            0.158845  
min            0.000000  
25%            0.000000  
50%            0.000000  
75%            0.000000  
max            2.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.987
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.985
Chunk:
Evaluation time span: 2013-07-07 00:00:00 2013-07-13 00:00:00
Model return rate:    0.0
Original return rate: 0.2
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-13 00:00:00
Model return rate:    0.0919282511211
Original return rate: 0.20987654321
########################################
Timestamp: 2017-03-02T10:08:28GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_numReturnedNotNull', 'VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numDSEligible1', 'pca_comp_45', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_44', 'Mileage', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_40', 'SellerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numReturnedNotNull', 'CarMake_numReturned1', 'pca_comp_38', 'pca_comp_53', 'CarMake_numReturnedNotNull', 'pca_comp_31', 'pca_comp_52', 'cluster_1_dist', 'pca_comp_39', 'CarMake_fracDSEligible1DivTransactions', 'cluster_0_dist', 'pca_comp_56', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_22', 'CarMake_numDSEligible1', 'pca_cluster_0_dist', 'JDPowersCat_PICKUP', 'pca_comp_16', 'pca_comp_28', 'pca_cluster_1_dist', 'SellerID_fracReturnedasm1DivTransactions', 'CarMake_numTransactions', 'CarMake_numReturnedasm1', 'pca_comp_34', 'SellerID_fracDSEligible1DivTransactions', 'SellerID_numDSEligible1', 'pca_comp_59', 'pca_comp_19', 'SellerID_numReturnedNotNull', 'pca_comp_14', 'SellerID_numTransactions', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_33']
Summarize top 5 important features:
       VIN_numReturnedNotNull  VIN_fracReturnedNotNullDivDSEligible1  \
count             6574.000000                            6574.000000   
mean                 0.071342                               0.069364   
std                  0.257414                               0.252138   
min                  0.000000                               0.000000   
25%                  0.000000                               0.000000   
50%                  0.000000                               0.000000   
75%                  0.000000                               0.000000   
max                  1.000000                               1.000000   

       VIN_numDSEligible1   pca_comp_45  \
count         6574.000000  6.574000e+03   
mean             0.968969  9.178571e-17   
std              0.294911  1.240888e-01   
min              0.000000 -1.320551e+00   
25%              1.000000 -4.878065e-02   
50%              1.000000 -1.317990e-02   
75%              1.000000  1.669662e-02   
max              4.000000  2.122892e+00   

       SellerID_fracReturned1DivReturnedNotNull  
count                               6574.000000  
mean                                   0.047432  
std                                    0.128853  
min                                    0.000000  
25%                                    0.000000  
50%                                    0.000000  
75%                                    0.000000  
max                                    1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.993
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.802
Chunk:
Evaluation time span: 2013-07-14 00:00:00 2013-07-20 00:00:00
Model return rate:    0.244698205546
Original return rate: 0.244698205546
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-20 00:00:00
Model return rate:    0.120401337793
Original return rate: 0.215659712815
########################################
Timestamp: 2017-03-02T10:08:36GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'pca_comp_65', 'pca_cluster_1_dist', 'pca_comp_66', 'pca_comp_01', 'VIN_numDSEligible1', 'pca_comp_64', 'pca_cluster_0_dist', 'cluster_0_dist', 'BuyerID_numReturnedNotNull', 'VIN_numTransactions', 'pca_comp_59', 'cluster_1_dist', 'pca_comp_31', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numReturned1', 'pca_comp_63', 'pca_comp_09', 'Mileage', 'cluster', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_55', 'pca_comp_56', 'pca_comp_52', 'pca_comp_00', 'pca_comp_36', 'SellerID_numDSEligible1', 'Arbitrated', 'pca_comp_12', 'pca_comp_33', 'pca_comp_05', 'pca_comp_08', 'pca_comp_51', 'MMR', 'Autocheck_score', 'pca_comp_42', 'SellerID_numTransactions', 'CarYear', 'pca_comp_37', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_16', 'pca_comp_46', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_47', 'pca_comp_24', 'JDPowersCat_numReturnedNotNull', 'pca_comp_53', 'BuyerID_numReturnedasm1', 'SellerID_numReturnedasm1', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_38', 'pca_comp_07', 'PSI', 'pca_comp_70', 'VIN_fracReturnedasm1DivTransactions', 'SalePrice', 'BuyerID_numTransactions', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_67', 'pca_comp_54', 'pca_comp_04', 'pca_comp_49']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6189.000000             6189.000000   
mean                                0.098872                0.104217   
std                                 0.295212                0.309769   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull  VIN_numReturned1  
count   6189.000000                          6189.000000       6189.000000  
mean       0.064631                             0.025852          0.026499  
std        0.245893                             0.158453          0.163616  
min        0.000000                             0.000000          0.000000  
25%        0.000000                             0.000000          0.000000  
50%        0.000000                             0.000000          0.000000  
75%        0.000000                             0.000000          0.000000  
max        1.000000                             1.000000          2.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.996
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.986
Chunk:
Evaluation time span: 2013-07-21 00:00:00 2013-07-27 00:00:00
Model return rate:    0.0
Original return rate: 0.269097222222
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-07-27 00:00:00
Model return rate:    0.106882591093
Original return rate: 0.22287321303
########################################
Timestamp: 2017-03-02T10:08:43GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_60', 'pca_comp_52', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_36', 'VIN_numReturned1', 'pca_comp_53', 'Returned_asm', 'pca_comp_37', 'SellingLocation_lon', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'DSEligible', 'pca_comp_01', 'VIN_numTransactions', 'SellerID_fracReturned1DivReturnedNotNull', 'BuyerID_numReturnedNotNull', 'pca_comp_66', 'BuyerID_numDSEligible1', 'pca_comp_50', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_56', 'pca_comp_05', 'SellerID_fracReturnedasm1DivTransactions', 'SellingLocation_fracReturned1DivReturnedNotNull', 'cluster_1_dist', 'pca_comp_34', 'BuyerID_numTransactions', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_00', 'pca_comp_48', 'CarMake_fracReturnedNotNullDivDSEligible1', 'SellingLocation_fracReturnedasm1DivTransactions', 'pca_comp_16', 'pca_comp_11', 'pca_comp_40', 'pca_comp_06', 'CarYear', 'pca_cluster_0_dist', 'pca_comp_29', 'pca_comp_57', 'SellingLocation_lat', 'pca_comp_38', 'cluster_0_dist', 'BuyerID_fracReturned1DivReturnedNotNull', 'SellingLocation_numTransactions', 'JDPowersCat_VAN', 'pca_comp_39', 'pca_comp_07', 'pca_comp_28', 'pca_comp_24', 'pca_comp_58', 'pca_comp_42', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_63', 'pca_comp_09', 'pca_comp_31', 'SellingLocation_numReturnedasm1', 'BuyerID_numReturnedasm1', 'pca_comp_10', 'pca_comp_68', 'LIGHT_N0G1Y2R3', 'pca_comp_33', 'pca_comp_21', 'pca_comp_49', 'pca_comp_23', 'SellingLocation_numReturnedNotNull', 'LIGHTR', 'pca_comp_27', 'pca_comp_03', 'SalePrice', 'Arbitrated', 'pca_comp_45', 'pca_comp_46', 'SellerID_fracDSEligible1DivTransactions', 'JDPowersCat_numReturnedNotNull', 'JDPowersCat_numDSEligible1', 'pca_comp_54', 'pca_comp_15', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_32', 'pca_comp_18', 'VIN_numDSEligible1', 'pca_comp_44', 'pca_comp_59', 'SellerID_numReturnedasm1', 'pca_comp_51', 'Autocheck_score', 'MMR', 'pca_comp_47', 'SellingLocation_numDSEligible1', 'pca_comp_41', 'pca_comp_04', 'Mileage', 'pca_comp_13', 'pca_comp_64', 'pca_comp_25', 'pca_comp_65', 'SellingLocation_numReturned1', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_61', 'pca_comp_55', 'JDPowersCat_numReturned1', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellerID_numReturned1', 'pca_comp_69', 'JDPowersCat_COMPACTCAR', 'VIN_fracDSEligible1DivTransactions']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5786.000000             5786.000000   
mean                                0.072531                0.076218   
std                                 0.256053                0.266021   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_numReturnedasm1  VIN_fracReturnedasm1DivTransactions   pca_comp_60  
count          5786.000000                          5786.000000  5.786000e+03  
mean              0.095230                             0.087885 -1.499662e-17  
std               0.300541                             0.278578  2.409827e-02  
min               0.000000                             0.000000 -5.944825e-01  
25%               0.000000                             0.000000 -2.675429e-03  
50%               0.000000                             0.000000 -1.020192e-03  
75%               0.000000                             0.000000  9.534016e-04  
max               2.000000                             1.000000  2.659562e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.994
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.362
Chunk:
Evaluation time span: 2013-07-28 00:00:00 2013-08-03 00:00:00
Model return rate:    0.256944444444
Original return rate: 0.256944444444
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-03 00:00:00
Model return rate:    0.127073113758
Original return rate: 0.226925459426
########################################
Timestamp: 2017-03-02T10:08:51GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_67', 'pca_comp_62', 'pca_comp_66', 'pca_comp_10', 'BuyerID_numReturnedNotNull', 'pca_comp_01', 'SellerID_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'pca_comp_04', 'VIN_numDSEligible1', 'SellerID_numReturnedasm1', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_64', 'pca_comp_58', 'pca_comp_09', 'Arbitrated', 'pca_cluster_0_dist', 'pca_comp_46', 'BuyerID_numReturned1', 'cluster_0_dist', 'pca_comp_29', 'pca_comp_61', 'pca_comp_25', 'BuyerID_fracReturned1DivReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_40', 'pca_comp_03', 'SalePrice', 'pca_comp_60', 'pca_comp_12', 'VIN_numTransactions', 'pca_comp_63', 'pca_comp_48', 'pca_comp_19', 'pca_comp_52', 'JDPowersCat_SPORTSCAR', 'pca_comp_06', 'MMR', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_33', 'pca_comp_53', 'JDPowersCat_numTransactions', 'pca_comp_34', 'pca_comp_08', 'pca_comp_30', 'pca_comp_56', 'JDPowersCat_numReturnedasm1', 'pca_comp_44', 'pca_comp_45', 'JDPowersCat_numReturnedNotNull', 'BuyerID_numDSEligible1', 'SellerID_numDSEligible1', 'SellerID_numTransactions', 'pca_cluster_1_dist', 'cluster_1_dist', 'CarMake_fracDSEligible1DivTransactions', 'SellerID_numReturned1', 'LIGHTR', 'pca_comp_13', 'SellerID_fracReturnedasm1DivTransactions', 'pca_comp_54', 'Mileage', 'pca_comp_26', 'pca_comp_69', 'pca_comp_02', 'pca_comp_00', 'pca_comp_43', 'LIGHT_N0G1Y2R3', 'JDPowersCat_numDSEligible1']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5913.000000             5913.000000   
mean                                0.098512                0.104854   
std                                 0.293687                0.310231   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull  \
count   5913.000000                          5913.000000   
mean       0.073398                             0.029427   
std        0.260810                             0.168512   
min        0.000000                             0.000000   
25%        0.000000                             0.000000   
50%        0.000000                             0.000000   
75%        0.000000                             0.000000   
max        1.000000                             1.000000   

       VIN_fracReturnedasm1DivTransactions  
count                          5913.000000  
mean                              0.077879  
std                               0.261841  
min                               0.000000  
25%                               0.000000  
50%                               0.000000  
75%                               0.000000  
max                               1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.997
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.995
Chunk:
Evaluation time span: 2013-08-04 00:00:00 2013-08-10 00:00:00
Model return rate:    0.0
Original return rate: 0.252727272727
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-10 00:00:00
Model return rate:    0.115942028986
Original return rate: 0.229556832932
########################################
Timestamp: 2017-03-02T10:08:59GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_numReturnedasm1', 'pca_comp_53', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_52', 'SellerID_numReturnedasm1', 'pca_comp_68', 'pca_comp_01', 'pca_comp_60', 'BuyerID_numReturnedNotNull', 'SellerID_numReturned1', 'pca_comp_38', 'pca_comp_58', 'Returned_asm', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'SellerID_fracReturned1DivReturnedNotNull', 'VIN_numTransactions', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_65', 'cluster', 'DSEligible', 'PSIEligible', 'VIN_numReturned1', 'Autocheck_score', 'LIGHT_N0G1Y2R3', 'pca_comp_25', 'pca_comp_15', 'pca_comp_63', 'LIGHTR', 'pca_comp_11', 'SellerID_fracDSEligible1DivTransactions', 'BuyerID_numReturned1', 'pca_comp_23', 'pca_comp_16', 'MMR', 'pca_comp_07', 'pca_comp_20', 'pca_comp_69', 'pca_cluster_1_dist', 'pca_comp_41', 'pca_comp_22', 'pca_comp_37', 'pca_comp_27', 'pca_comp_17', 'pca_comp_66', 'pca_comp_47', 'pca_comp_10', 'pca_comp_05', 'SellerID_numTransactions', 'Mileage', 'cluster_0_dist', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'CarMake_fracDSEligible1DivTransactions', 'BuyerID_fracReturned1DivReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6161.000000              6161.00000   
mean                                0.068035                 0.07304   
std                                 0.247775                 0.26394   
min                                 0.000000                 0.00000   
25%                                 0.000000                 0.00000   
50%                                 0.000000                 0.00000   
75%                                 0.000000                 0.00000   
max                                 1.000000                 2.00000   

       VIN_numReturnedasm1   pca_comp_53  VIN_fracReturned1DivReturnedNotNull  
count          6161.000000  6.161000e+03                          6161.000000  
mean              0.088297 -1.413570e-17                             0.003814  
std               0.288290  7.290043e-02                             0.060652  
min               0.000000 -4.557946e-01                             0.000000  
25%               0.000000 -9.764019e-03                             0.000000  
50%               0.000000 -2.045005e-03                             0.000000  
75%               0.000000  6.489227e-03                             0.000000  
max               3.000000  1.263895e+00                             1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.994
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.584
Chunk:
Evaluation time span: 2013-08-11 00:00:00 2013-08-17 00:00:00
Model return rate:    0.230529595016
Original return rate: 0.230529595016
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-17 00:00:00
Model return rate:    0.129733783277
Original return rate: 0.22966031483
########################################
Timestamp: 2017-03-02T10:09:07GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_66', 'VIN_fracReturnedasm1DivTransactions', 'SellerID_numReturnedasm1', 'pca_comp_61', 'VIN_numTransactions', 'pca_comp_01', 'BuyerID_numReturnedNotNull', 'pca_comp_33', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'VIN_numDSEligible1', 'pca_comp_34', 'pca_comp_56', 'BuyerID_numReturned1', 'pca_comp_12', 'pca_cluster_1_dist', 'Arbitrated', 'SellerID_fracReturned1DivReturnedNotNull', 'SellerID_numDSEligible1', 'cluster_1_dist', 'VIN_numReturned1', 'pca_comp_41', 'pca_comp_02', 'pca_comp_62', 'SellerID_numTransactions', 'pca_comp_19', 'pca_comp_04', 'pca_comp_00', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_37', 'pca_comp_21', 'pca_comp_47', 'pca_comp_65', 'pca_comp_63', 'pca_comp_60', 'pca_comp_38', 'pca_comp_51', 'VIN_numReturnedasm1', 'pca_comp_52', 'BuyerID_fracReturnedasm1DivTransactions', 'SellerID_numReturned1', 'LIGHTR', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_27', 'pca_comp_48', 'pca_comp_67', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_32', 'pca_comp_07', 'CarYear', 'cluster_0_dist', 'pca_comp_70', 'pca_comp_11', 'pca_comp_43', 'pca_comp_08', 'pca_comp_58', 'pca_comp_72', 'BuyerID_fracReturned1DivReturnedNotNull', 'SellerID_fracDSEligible1DivTransactions', 'SellerID_fracReturnedasm1DivTransactions', 'JDPowersCat_numReturnedNotNull', 'pca_comp_64', 'MMR', 'pca_comp_15', 'pca_comp_10', 'pca_comp_03', 'pca_comp_39', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_24', 'Autocheck_score', 'pca_comp_45', 'pca_comp_06', 'pca_comp_55', 'pca_comp_23', 'SaleDate_decyear', 'BuyerID_numDSEligible1', 'pca_comp_53', 'pca_comp_59', 'pca_comp_28', 'Mileage', 'JDPowersCat_numTransactions', 'pca_comp_30', 'pca_comp_22', 'CarMake_fracReturnedasm1DivTransactions', 'CarMake_fracDSEligible1DivTransactions']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  Returned_asm  \
count                            6503.000000   6503.000000   
mean                                0.100351      0.078887   
std                                 0.296334      0.269583   
min                                 0.000000      0.000000   
25%                                 0.000000      0.000000   
50%                                 0.000000      0.000000   
75%                                 0.000000      0.000000   
max                                 1.000000      1.000000   

       VIN_numReturnedNotNull  VIN_fracReturned1DivReturnedNotNull  \
count             6503.000000                          6503.000000   
mean                 0.106874                             0.027141   
std                  0.313424                             0.162152   
min                  0.000000                             0.000000   
25%                  0.000000                             0.000000   
50%                  0.000000                             0.000000   
75%                  0.000000                             0.000000   
max                  2.000000                             1.000000   

        pca_comp_66  
count  6.503000e+03  
mean  -2.052982e-18  
std    9.090636e-03  
min   -2.504469e-01  
25%   -7.261761e-04  
50%    6.435760e-05  
75%    7.087362e-04  
max    1.260821e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.997
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.982
Chunk:
Evaluation time span: 2013-08-18 00:00:00 2013-08-24 00:00:00
Model return rate:    0.0
Original return rate: 0.184568835098
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-24 00:00:00
Model return rate:    0.117827345479
Original return rate: 0.225209080048
########################################
Timestamp: 2017-03-02T10:09:15GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_60', 'pca_comp_35', 'pca_comp_49', 'pca_comp_34', 'DSEligible', 'pca_cluster_0_dist', 'Returned_asm', 'cluster_0_dist', 'pca_comp_58', 'pca_comp_63', 'SellingLocation_lon', 'VIN_numReturned1', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_39', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_11', 'pca_comp_66', 'VIN_numReturnedasm1', 'BuyerID_numReturnedNotNull', 'pca_comp_37', 'pca_comp_13', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_02', 'SellingLocation_lat', 'pca_comp_14', 'VIN_fracDSEligible1DivTransactions', 'PSIEligible', 'BuyerID_numReturned1', 'pca_comp_09', 'pca_comp_64', 'pca_comp_29', 'CarYear', 'pca_comp_19', 'SellerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numReturnedasm1', 'SellingLocation_numTransactions', 'pca_comp_47', 'pca_comp_52', 'pca_comp_55', 'pca_comp_45', 'SellingLocation_numDSEligible1', 'pca_comp_08', 'pca_comp_27', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_10', 'VIN_numTransactions', 'pca_comp_18', 'SellerID_numReturnedasm1', 'SellingLocation_numReturnedasm1', 'pca_comp_61', 'pca_comp_01', 'pca_comp_28', 'SellerID_fracReturnedasm1DivTransactions', 'SalePrice', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_05', 'SellerID_fracDSEligible1DivTransactions', 'Mileage', 'VIN_numDSEligible1', 'pca_comp_24', 'pca_comp_36', 'SellingLocation_numReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6216.000000             6216.000000   
mean                                0.089125                0.095560   
std                                 0.280361                0.297817   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull   pca_comp_60   pca_comp_35  
count                          6216.000000  6.216000e+03  6.216000e+03  
mean                              0.006113  7.557322e-19 -2.120962e-17  
std                               0.076391  2.712373e-02  3.110510e-01  
min                               0.000000 -5.443008e-01 -1.538647e+00  
25%                               0.000000 -3.803259e-03 -1.799287e-01  
50%                               0.000000 -1.545376e-03 -2.196175e-02  
75%                               0.000000  1.652658e-03  1.388972e-01  
max                               1.000000  2.317003e-01  2.139217e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.994
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.562
Chunk:
Evaluation time span: 2013-08-25 00:00:00 2013-08-31 00:00:00
Model return rate:    0.225543478261
Original return rate: 0.225543478261
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-08-31 00:00:00
Model return rate:    0.129822968679
Original return rate: 0.22524219591
########################################
Timestamp: 2017-03-02T10:09:23GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_66', 'pca_comp_61', 'VIN_numReturned1', 'DSEligible', 'pca_comp_64', 'pca_comp_62', 'pca_comp_70', 'pca_comp_46', 'pca_comp_39', 'pca_comp_36', 'pca_comp_53', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_71', 'pca_comp_32', 'pca_comp_59', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_42', 'CarMake_numReturned1', 'pca_comp_44', 'pca_comp_50', 'pca_comp_56', 'BuyerID_numTransactions', 'pca_comp_11', 'BuyerID_numReturnedasm1', 'VIN_numReturnedasm1', 'pca_comp_72', 'CarMake_numReturnedasm1', 'pca_comp_49', 'pca_comp_37', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_35', 'SellerID_fracReturnedasm1DivTransactions', 'pca_comp_02', 'pca_comp_47', 'pca_comp_45', 'BuyerID_fracDSEligible1DivTransactions', 'CarMake_numReturnedNotNull', 'JDPowersCat_numDSEligible1', 'CarMake_numTransactions', 'SellingLocation_lon', 'SellerID_fracReturned1DivReturnedNotNull', 'cluster_1_dist', 'pca_comp_51', 'BuyerID_numDSEligible1', 'pca_comp_60', 'SellingLocation_fracReturned1DivReturnedNotNull', 'pca_comp_27', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'pca_comp_58', 'JDPowersCat_numReturned1', 'pca_comp_29', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_34', 'SellingLocation_numReturnedasm1', 'CarMake_numDSEligible1', 'SellingLocation_lat', 'SellingLocation_numReturned1', 'pca_comp_04', 'pca_comp_14', 'pca_comp_43', 'SalePrice', 'pca_comp_38', 'Mileage', 'JDPowersCat_numReturnedNotNull', 'JDPowersCat_numTransactions', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'pca_comp_03', 'pca_comp_09', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_01', 'SellingLocation_numReturnedNotNull', 'pca_comp_57', 'MMR', 'pca_comp_69', 'pca_comp_17', 'pca_comp_28', 'pca_cluster_0_dist', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_33', 'pca_comp_63', 'BuyerID_fracReturnedasm1DivTransactions', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_22', 'JDPowersCat_fracDSEligible1DivTransactions', 'SellerID_numReturned1', 'pca_comp_07', 'JDPowersCat_LUXURYCAR', 'pca_comp_05', 'pca_comp_19', 'JDPowersCat_PICKUP', 'pca_comp_41', 'pca_comp_67', 'SellerID_numReturnedNotNull', 'SaleDate_dow', 'SellingLocation_numDSEligible1', 'JDPowersCat_numReturnedasm1', 'pca_comp_15', 'cluster', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_55', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_24', 'pca_comp_20', 'pca_comp_21', 'CarMake_fracReturnedasm1DivTransactions', 'BuyerID_numReturnedNotNull', 'pca_comp_06', 'pca_comp_31', 'pca_comp_54', 'pca_comp_23', 'pca_comp_18', 'SellingLocation_fracDSEligible1DivTransactions', 'VIN_numTransactions', 'pca_comp_65', 'pca_comp_52', 'CarYear', 'pca_comp_08', 'SellerID_numReturnedasm1', 'SellerID_numTransactions', 'BuyerID_numReturned1', 'pca_comp_26', 'JDPowersCat_fracReturnedasm1DivTransactions', 'pca_comp_68']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6331.000000             6331.000000   
mean                                0.116990                0.125888   
std                                 0.316842                0.340677   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_66  
count   6331.000000                          6331.000000  6.331000e+03  
mean       0.091771                             0.031906 -1.117285e-17  
std        0.288725                             0.175090  9.898213e-03  
min        0.000000                             0.000000 -1.837539e-01  
25%        0.000000                             0.000000 -8.687123e-04  
50%        0.000000                             0.000000 -2.442378e-05  
75%        0.000000                             0.000000  6.976617e-04  
max        1.000000                             1.000000  2.311562e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.990
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.994
Chunk:
Evaluation time span: 2013-09-01 00:00:00 2013-09-07 00:00:00
Model return rate:    0.0
Original return rate: 0.193602693603
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-07 00:00:00
Model return rate:    0.1210496614
Original return rate: 0.222900573137
########################################
Timestamp: 2017-03-02T10:09:32GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'pca_comp_51', 'pca_comp_13', 'pca_comp_59', 'pca_comp_26', 'Returned_asm', 'pca_comp_66', 'DSEligible', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_27', 'pca_comp_58', 'pca_comp_57', 'pca_comp_15', 'cluster_1_dist', 'pca_comp_53', 'pca_comp_52', 'pca_comp_65', 'pca_cluster', 'pca_comp_41', 'pca_cluster_0_dist', 'pca_comp_47', 'BuyerID_numDSEligible1', 'pca_cluster_1_dist', 'cluster', 'BuyerID_numTransactions', 'SellingLocation_fracReturnedasm1DivTransactions', 'VIN_numTransactions', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_23', 'JDPowersCat_numReturnedasm1', 'pca_comp_08', 'pca_comp_56', 'pca_comp_11', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'pca_comp_48', 'SellingLocation_fracDSEligible1DivTransactions', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_40', 'pca_comp_67', 'pca_comp_09']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5026.000000             5026.000000   
mean                                0.096896                0.102268   
std                                 0.292860                0.308240   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturned1   pca_comp_51  
count                          5026.000000       5026.000000  5.026000e+03  
mean                              0.003880          0.003979  7.573970e-18  
std                               0.061772          0.062962  9.247145e-02  
min                               0.000000          0.000000 -4.871721e-01  
25%                               0.000000          0.000000 -1.715146e-02  
50%                               0.000000          0.000000  3.905734e-03  
75%                               0.000000          0.000000  2.302979e-02  
max                               1.000000          1.000000  1.228385e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.993
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.404
Chunk:
Evaluation time span: 2013-09-08 00:00:00 2013-09-14 00:00:00
Model return rate:    0.221621621622
Original return rate: 0.221621621622
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-14 00:00:00
Model return rate:    0.130556974962
Original return rate: 0.222792607803
########################################
Timestamp: 2017-03-02T10:09:40GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_64', 'pca_comp_60', 'pca_comp_66', 'VIN_numReturned1', 'pca_comp_61', 'pca_comp_47', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_11', 'VIN_numTransactions', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_55', 'Arbitrated', 'pca_comp_30', 'VIN_numReturnedasm1', 'pca_comp_54', 'pca_comp_00', 'pca_comp_48', 'pca_comp_52', 'pca_comp_28', 'pca_comp_26', 'pca_cluster_0_dist', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_24', 'pca_comp_18', 'BuyerID_numDSEligible1', 'VIN_numDSEligible1', 'pca_comp_41', 'pca_comp_14', 'pca_comp_56', 'CarMake_fracReturnedNotNullDivDSEligible1', 'cluster_1_dist', 'BuyerID_numTransactions', 'pca_comp_38', 'pca_comp_37', 'pca_comp_32', 'pca_comp_33', 'SellerID_numReturnedasm1', 'pca_comp_04', 'pca_comp_23', 'pca_comp_34', 'pca_comp_70', 'BuyerID_numReturned1', 'SellingLocation_fracReturned1DivReturnedNotNull', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_68', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_19', 'BuyerID_numReturnedNotNull', 'pca_comp_31', 'pca_comp_06', 'pca_cluster_1_dist', 'pca_comp_15', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_63', 'pca_comp_71', 'pca_comp_07', 'pca_comp_05', 'pca_comp_44', 'pca_comp_67', 'pca_comp_09', 'SellingLocation_numTransactions', 'pca_comp_29', 'SellingLocation_lat', 'cluster_0_dist', 'pca_comp_59', 'SellingLocation_numReturned1', 'pca_comp_27', 'JDPowersCat_SUV', 'SellerID_fracReturnedasm1DivTransactions', 'SellingLocation_fracReturnedasm1DivTransactions', 'pca_comp_46', 'CarMake_numDSEligible1', 'cluster', 'JDPowersCat_numReturnedNotNull', 'SellerID_numReturned1', 'pca_comp_39', 'CarMake_numTransactions', 'JDPowersCat_fracReturnedasm1DivTransactions', 'pca_comp_50', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracDSEligible1DivTransactions', 'pca_comp_22', 'pca_comp_49', 'pca_comp_62', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_40', 'JDPowersCat_numReturned1', 'pca_comp_13', 'pca_comp_03', 'SellingLocation_numReturnedNotNull', 'pca_comp_25', 'pca_comp_58', 'JDPowersCat_PICKUP', 'LIGHTR', 'pca_comp_57', 'SellerID_numReturnedNotNull', 'pca_comp_51', 'pca_comp_43', 'JDPowersCat_numDSEligible1', 'SellerID_numDSEligible1', 'SaleDate_doy', 'pca_comp_69', 'pca_comp_17', 'pca_comp_53', 'BuyerID_numReturnedasm1', 'CarMake_numReturned1', 'MMR', 'pca_comp_01', 'DSEligible', 'SalePrice', 'Autocheck_score', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_10', 'pca_comp_42', 'pca_comp_08', 'JDPowersCat_numReturnedasm1', 'pca_comp_65', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_16', 'pca_comp_02', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_36', 'CarYear', 'JDPowersCat_numTransactions', 'CarMake_numReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6598.000000             6598.000000   
mean                                0.112029                0.120491   
std                                 0.310834                0.332926   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_64  
count   6598.000000                          6598.000000  6.598000e+03  
mean       0.086693                             0.029479  9.725777e-19  
std        0.281406                             0.168371  1.475371e-02  
min        0.000000                             0.000000 -5.620931e-01  
25%        0.000000                             0.000000 -7.779020e-04  
50%        0.000000                             0.000000  1.325380e-04  
75%        0.000000                             0.000000  9.685702e-04  
max        1.000000                             1.000000  1.351816e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.997
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.993
Chunk:
Evaluation time span: 2013-09-15 00:00:00 2013-09-21 00:00:00
Model return rate:    0.0
Original return rate: 0.235378031384
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-21 00:00:00
Model return rate:    0.12219033955
Original return rate: 0.223724516742
########################################
Timestamp: 2017-03-02T10:09:49GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'pca_comp_50', 'pca_comp_35', 'pca_comp_58', 'DSEligible', 'pca_comp_59', 'Returned_asm', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_53', 'pca_comp_62', 'VIN_numDSEligible1', 'BuyerID_numReturnedNotNull', 'SellerID_fracDSEligible1DivTransactions', 'pca_cluster_1_dist', 'pca_comp_48', 'pca_comp_00', 'BuyerID_numDSEligible1', 'pca_comp_66', 'BuyerID_numTransactions', 'pca_comp_02', 'pca_comp_65', 'pca_comp_40', 'SaleDate_day', 'VIN_fracDSEligible1DivTransactions', 'SellerID_fracReturnedasm1DivTransactions', 'SellerID_numReturnedNotNull', 'cluster_1_dist', 'pca_comp_21', 'SellerID_numTransactions', 'pca_cluster_0_dist', 'pca_comp_10', 'pca_comp_12', 'pca_comp_63', 'pca_comp_60', 'pca_comp_37', 'SaleDate_doy', 'SaleDate_decyear', 'pca_comp_30', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_01', 'cluster', 'VIN_numReturnedasm1', 'BuyerID_fracReturned1DivReturnedNotNull', 'SellerID_numDSEligible1', 'cluster_0_dist', 'pca_comp_51', 'pca_comp_34', 'pca_comp_25', 'Mileage']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6112.000000             6112.000000   
mean                                0.089469                0.095223   
std                                 0.281728                0.297423   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturned1   pca_comp_50  
count                          6112.000000       6112.000000  6.112000e+03  
mean                              0.004745          0.005236 -4.302395e-19  
std                               0.067523          0.074406  9.815489e-02  
min                               0.000000          0.000000 -6.552332e-01  
25%                               0.000000          0.000000 -2.470966e-02  
50%                               0.000000          0.000000  3.683626e-03  
75%                               0.000000          0.000000  3.089828e-02  
max                               1.000000          2.000000  1.511555e+00  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.993
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.497
Chunk:
Evaluation time span: 2013-09-22 00:00:00 2013-09-28 00:00:00
Model return rate:    0.198550724638
Original return rate: 0.198550724638
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-09-28 00:00:00
Model return rate:    0.128009719461
Original return rate: 0.222014374323
########################################
Timestamp: 2017-03-02T10:09:58GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'VIN_numReturnedNotNull', 'pca_comp_66', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_64', 'SellerID_numReturned1', 'SellingLocation_lat', 'pca_comp_60', 'VIN_numReturned1', 'SellingLocation_lon', 'pca_comp_00', 'pca_comp_61', 'SellerID_numReturnedasm1', 'BuyerID_numReturnedNotNull', 'pca_comp_25', 'pca_comp_38', 'pca_comp_31', 'VIN_fracReturnedasm1DivTransactions', 'VIN_numTransactions', 'pca_comp_01', 'BuyerID_fracReturned1DivReturnedNotNull', 'cluster', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_14', 'VIN_numDSEligible1', 'pca_comp_40', 'VIN_numReturnedasm1', 'pca_comp_51', 'SellingLocation_numReturnedasm1', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_52', 'pca_comp_02', 'pca_comp_59', 'pca_cluster_0_dist', 'pca_comp_33', 'SellingLocation_numReturned1', 'cluster_0_dist', 'pca_comp_06', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_50', 'pca_comp_47', 'SellingLocation_numReturnedNotNull', 'pca_comp_35', 'pca_comp_13', 'JDPowersCat_VAN', 'pca_comp_63', 'pca_comp_09', 'pca_comp_68', 'JDPowersCat_numReturnedasm1', 'pca_comp_18', 'VIN_fracDSEligible1DivTransactions', 'BuyerID_numDSEligible1', 'SellingLocation_numTransactions', 'pca_comp_53', 'pca_comp_27', 'pca_comp_69', 'LIGHTR', 'pca_comp_49', 'SellingLocation_numDSEligible1', 'SellerID_numReturnedNotNull', 'pca_comp_46', 'pca_comp_08', 'CarYear', 'pca_comp_32', 'pca_comp_11', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_12', 'pca_comp_37', 'pca_comp_29', 'pca_comp_10', 'BuyerID_numReturned1', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_48', 'SalePrice', 'pca_comp_62', 'pca_cluster_1_dist', 'pca_comp_45', 'pca_comp_24', 'pca_comp_71', 'pca_comp_16', 'JDPowersCat_fracDSEligible1DivTransactions', 'SellerID_numDSEligible1', 'Autocheck_score', 'SaleDate_doy', 'SaleDate_decyear', 'pca_comp_05', 'JDPowersCat_numReturned1', 'pca_comp_56', 'pca_comp_04', 'pca_comp_22', 'BuyerID_numTransactions', 'SellingLocation_fracReturned1DivReturnedNotNull', 'BuyerID_fracDSEligible1DivTransactions', 'pca_comp_26', 'pca_comp_42', 'pca_comp_43', 'pca_comp_58', 'CarMake_fracReturnedNotNullDivDSEligible1', 'Arbitrated', 'pca_comp_17', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'SaleDate_dow', 'pca_comp_07', 'JDPowersCat_numReturnedNotNull', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_30', 'pca_comp_65', 'pca_comp_15', 'pca_comp_54', 'SellerID_fracReturnedasm1DivTransactions']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  Returned_asm  \
count                            5919.000000   5919.000000   
mean                                0.118587      0.089035   
std                                 0.318143      0.284819   
min                                 0.000000      0.000000   
25%                                 0.000000      0.000000   
50%                                 0.000000      0.000000   
75%                                 0.000000      0.000000   
max                                 1.000000      1.000000   

       VIN_numReturnedNotNull   pca_comp_66  \
count             5919.000000  5.919000e+03   
mean                 0.128569 -1.726563e-17   
std                  0.343717  1.124832e-02   
min                  0.000000 -2.498508e-01   
25%                  0.000000 -1.036708e-03   
50%                  0.000000 -6.866202e-06   
75%                  0.000000  9.006167e-04   
max                  2.000000  1.205782e-01   

       VIN_fracReturned1DivReturnedNotNull  
count                          5919.000000  
mean                              0.029904  
std                               0.169341  
min                               0.000000  
25%                               0.000000  
50%                               0.000000  
75%                               0.000000  
max                               1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.997
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.988
Chunk:
Evaluation time span: 2013-09-29 00:00:00 2013-10-05 00:00:00
Model return rate:    0.0
Original return rate: 0.236694677871
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-05 00:00:00
Model return rate:    0.120741743932
Original return rate: 0.222978566829
########################################
Timestamp: 2017-03-02T10:10:08GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_numReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_60', 'VIN_numReturned1', 'pca_comp_24', 'pca_comp_14', 'pca_comp_58', 'DSEligible', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_38', 'Returned_asm', 'pca_comp_37', 'pca_comp_67', 'pca_comp_48', 'pca_comp_52', 'pca_comp_63', 'SellerID_fracReturnedNotNullDivDSEligible1', 'VIN_numTransactions', 'pca_comp_36', 'pca_comp_34', 'pca_comp_55', 'pca_comp_57', 'pca_comp_56', 'pca_comp_00', 'BuyerID_numReturned1', 'pca_comp_21', 'SaleDate_day', 'SaleDate_dow', 'SalePrice', 'SellerID_fracReturnedasm1DivTransactions', 'SaleDate_decyear', 'pca_comp_53', 'JDPowersCat_PICKUP', 'pca_cluster_0_dist', 'Autocheck_score', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_30', 'JDPowersCat_MIDSIZECAR', 'MMR', 'pca_comp_02', 'pca_comp_25', 'pca_comp_42', 'SellingLocation_numReturnedasm1', 'pca_comp_29', 'pca_comp_07', 'BuyerID_fracReturned1DivReturnedNotNull', 'VIN_fracDSEligible1DivTransactions', 'BuyerID_numReturnedasm1', 'pca_comp_18', 'SaleDate_doy', 'cluster_0_dist', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_numReturnedNotNull', 'pca_comp_22', 'pca_comp_03', 'pca_comp_10', 'pca_comp_19', 'pca_comp_41', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_12', 'SellingLocation_numDSEligible1', 'SellingLocation_fracReturned1DivReturnedNotNull', 'pca_comp_13', 'pca_comp_65', 'CarYear', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_68', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_50', 'SellerID_numTransactions', 'pca_comp_01', 'CarMake_fracReturnedasm1DivTransactions', 'VIN_numDSEligible1', 'pca_cluster_1_dist', 'pca_comp_46', 'BuyerID_numReturnedNotNull', 'pca_comp_40', 'pca_comp_27']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  \
count                            5821.000000   
mean                                0.098279   
std                                 0.291236   
min                                 0.000000   
25%                                 0.000000   
50%                                 0.000000   
75%                                 0.000000   
max                                 1.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturnedasm1  \
count                          5821.000000          5821.000000   
mean                              0.010222             0.108057   
std                               0.099950             0.316508   
min                               0.000000             0.000000   
25%                               0.000000             0.000000   
50%                               0.000000             0.000000   
75%                               0.000000             0.000000   
max                               1.000000             2.000000   

       VIN_numReturnedNotNull  VIN_fracReturnedasm1DivTransactions  
count             5821.000000                          5821.000000  
mean                 0.107542                             0.093211  
std                  0.314779                             0.279869  
min                  0.000000                             0.000000  
25%                  0.000000                             0.000000  
50%                  0.000000                             0.000000  
75%                  0.000000                             0.000000  
max                  3.000000                             1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.987
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.450
Chunk:
Evaluation time span: 2013-10-06 00:00:00 2013-10-12 00:00:00
Model return rate:    0.260740740741
Original return rate: 0.260740740741
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-12 00:00:00
Model return rate:    0.129939653494
Original return rate: 0.225186211675
########################################
Timestamp: 2017-03-02T10:10:17GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_64', 'DSEligible', 'pca_comp_61', 'pca_comp_59', 'VIN_numReturned1', 'VIN_numReturnedasm1', 'pca_comp_66', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_52', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_37', 'pca_comp_46', 'pca_comp_31', 'pca_comp_02', 'pca_comp_36', 'Arbitrated', 'pca_comp_38', 'cluster_1_dist', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numTransactions', 'pca_comp_32', 'cluster_0_dist', 'SellingLocation_numDSEligible1', 'VIN_numTransactions', 'BuyerID_numDSEligible1', 'pca_comp_45', 'pca_comp_67', 'SellingLocation_fracReturned1DivReturnedNotNull', 'pca_comp_65', 'pca_comp_69', 'pca_comp_51', 'VIN_numDSEligible1', 'pca_cluster_0_dist', 'SellingLocation_numReturnedNotNull', 'pca_comp_20', 'pca_comp_26', 'pca_comp_01', 'pca_comp_06', 'SellingLocation_numReturned1', 'SellingLocation_numReturnedasm1', 'pca_comp_33', 'pca_comp_30', 'pca_comp_57', 'SellerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numReturnedNotNull', 'pca_comp_34', 'pca_comp_12', 'SellerID_numDSEligible1', 'pca_comp_42', 'pca_comp_58', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numTransactions', 'pca_comp_47', 'SellingLocation_fracDSEligible1DivTransactions', 'JDPowersCat_SUV', 'SellerID_numReturnedNotNull', 'pca_comp_22', 'pca_comp_24', 'CarYear', 'pca_comp_10', 'pca_comp_23', 'CarMake_fracReturned1DivReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5924.000000             5924.000000   
mean                                0.115856                0.125591   
std                                 0.314463                0.337973   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_64  
count   5924.000000                          5924.000000  5.924000e+03  
mean       0.087441                             0.037643  2.415318e-17  
std        0.282504                             0.189682  1.449465e-02  
min        0.000000                             0.000000 -2.194927e-01  
25%        0.000000                             0.000000 -1.527965e-03  
50%        0.000000                             0.000000 -5.345464e-05  
75%        0.000000                             0.000000  1.274495e-03  
max        1.000000                             1.000000  2.330418e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.995
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.995
Chunk:
Evaluation time span: 2013-10-13 00:00:00 2013-10-19 00:00:00
Model return rate:    0.0
Original return rate: 0.280575539568
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-19 00:00:00
Model return rate:    0.123909411546
Original return rate: 0.228331018708
########################################
Timestamp: 2017-03-02T10:10:26GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_fracReturnedasm1DivTransactions', 'Returned_asm', 'pca_comp_56', 'DSEligible', 'pca_comp_53', 'pca_comp_59', 'VIN_numReturned1', 'pca_comp_63', 'pca_comp_49', 'pca_comp_15', 'pca_comp_34', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_numDSEligible1', 'pca_comp_57', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_37', 'pca_comp_13', 'BuyerID_numTransactions', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_66', 'PSI', 'VIN_numDSEligible1', 'pca_comp_04', 'CarMake_numReturnedNotNull', 'pca_comp_40', 'Arbitrated', 'pca_comp_48', 'pca_comp_67', 'pca_comp_02', 'CarMake_numReturned1', 'pca_comp_29', 'pca_comp_31', 'pca_comp_70', 'pca_comp_33', 'SellingLocation_numTransactions', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'SellingLocation_fracDSEligible1DivTransactions', 'VIN_numTransactions', 'SellingLocation_lat', 'CarMake_numTransactions', 'pca_comp_43', 'pca_comp_62', 'SellingLocation_numReturned1', 'pca_comp_65', 'pca_comp_01', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_68', 'JDPowersCat_numTransactions', 'CarMake_fracReturnedNotNullDivDSEligible1', 'pca_comp_23', 'JDPowersCat_numDSEligible1', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_36', 'CarMake_numDSEligible1', 'JDPowersCat_numReturnedasm1', 'pca_comp_03', 'pca_comp_58', 'CarYear', 'CarMake_numReturnedasm1', 'pca_comp_61', 'SellingLocation_numReturnedasm1', 'JDPowersCat_SUV', 'MMR', 'JDPowersCat_SPORTSCAR', 'pca_comp_44', 'JDPowersCat_fracReturnedasm1DivTransactions', 'JDPowersCat_numReturned1', 'Mileage', 'pca_comp_07', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_cluster_0_dist', 'pca_comp_11', 'pca_comp_17', 'pca_cluster', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_54', 'JDPowersCat_numReturnedNotNull']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5737.000000             5737.000000   
mean                                0.090030                0.096915   
std                                 0.282151                0.302856   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturnedasm1  \
count                          5737.000000          5737.000000   
mean                              0.006014             0.100924   
std                               0.075322             0.306418   
min                               0.000000             0.000000   
25%                               0.000000             0.000000   
50%                               0.000000             0.000000   
75%                               0.000000             0.000000   
max                               1.000000             2.000000   

       VIN_fracReturnedasm1DivTransactions  
count                          5737.000000  
mean                              0.089550  
std                               0.277042  
min                               0.000000  
25%                               0.000000  
50%                               0.000000  
75%                               0.000000  
max                               1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.985
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.526
Chunk:
Evaluation time span: 2013-10-20 00:00:00 2013-10-26 00:00:00
Model return rate:    0.238993710692
Original return rate: 0.238993710692
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-10-26 00:00:00
Model return rate:    0.131817788919
Original return rate: 0.228981282602
########################################
Timestamp: 2017-03-02T10:10:35GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_63', 'pca_comp_67', 'pca_comp_60', 'VIN_numReturned1', 'pca_comp_61', 'Arbitrated', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'VIN_numTransactions', 'VIN_numDSEligible1', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_25', 'pca_comp_00', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_54', 'VIN_numReturnedasm1', 'BuyerID_numReturnedNotNull', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_42', 'pca_comp_46', 'pca_comp_05', 'pca_comp_40', 'pca_comp_06', 'pca_comp_30', 'pca_comp_34', 'MMR', 'pca_comp_59', 'pca_comp_53']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6152.000000             6152.000000   
mean                                0.130188                0.141905   
std                                 0.330630                0.359536   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                3.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_63  
count   6152.000000                          6152.000000  6.152000e+03  
mean       0.093791                             0.038064 -1.378770e-17  
std        0.291561                             0.190312  1.996559e-02  
min        0.000000                             0.000000 -4.822711e-01  
25%        0.000000                             0.000000 -3.113831e-03  
50%        0.000000                             0.000000  2.611920e-05  
75%        0.000000                             0.000000  3.063983e-03  
max        1.000000                             1.000000  2.100999e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.993
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.983
Chunk:
Evaluation time span: 2013-10-27 00:00:00 2013-11-02 00:00:00
Model return rate:    0.0
Original return rate: 0.236220472441
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-02 00:00:00
Model return rate:    0.125504073739
Original return rate: 0.22938106972
########################################
Timestamp: 2017-03-02T10:10:45GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_numReturnedasm1', 'VIN_fracReturnedasm1DivTransactions', 'DSEligible', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_63', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'pca_comp_38', 'pca_comp_51', 'pca_comp_55', 'pca_comp_69', 'VIN_numReturned1', 'pca_comp_40', 'pca_comp_54', 'VIN_numTransactions', 'pca_comp_57', 'pca_comp_58', 'pca_comp_33', 'pca_comp_48', 'pca_comp_03', 'pca_comp_70', 'pca_comp_26', 'pca_comp_59', 'pca_cluster_1_dist', 'pca_comp_15', 'BuyerID_numReturned1', 'pca_comp_65', 'BuyerID_numReturnedNotNull', 'cluster_1_dist', 'SellerID_numReturnedasm1', 'cluster_0_dist', 'pca_comp_01', 'SellerID_numReturned1', 'pca_comp_21', 'pca_comp_00', 'InLane', 'pca_comp_47', 'pca_cluster_0_dist', 'pca_comp_19', 'CarYear', 'pca_comp_44', 'pca_comp_50', 'pca_comp_64', 'pca_comp_34', 'pca_comp_72', 'pca_comp_62', 'pca_comp_18', 'BuyerID_numDSEligible1', 'pca_comp_27', 'pca_comp_24', 'pca_comp_10', 'pca_comp_17', 'pca_comp_73', 'pca_comp_32', 'pca_comp_13', 'CarMake_numReturned1', 'CarMake_numReturnedasm1', 'pca_comp_37', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_49', 'pca_comp_68', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_76', 'SaleDate_decyear', 'pca_comp_41', 'BuyerID_fracReturned1DivReturnedNotNull', 'SellerID_numDSEligible1', 'VIN_numDSEligible1', 'SellerID_fracReturned1DivReturnedNotNull', 'JDPowersCat_MIDSIZECAR', 'pca_comp_08', 'pca_comp_60', 'pca_comp_67', 'pca_comp_53', 'Mileage', 'pca_comp_56', 'pca_comp_29', 'pca_comp_52', 'pca_comp_11', 'pca_comp_30', 'MMR', 'BuyerID_numReturnedasm1', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_07', 'BuyerID_numTransactions', 'JDPowersCat_fracReturnedasm1DivTransactions', 'JDPowersCat_COMPACTCAR', 'pca_comp_06', 'pca_comp_04', 'pca_comp_25', 'Autocheck_score', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'JDPowersCat_LUXURYCAR', 'JDPowersCat_numTransactions', 'SellingLocation_numReturnedNotNull', 'pca_comp_45']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            5519.000000             5519.000000   
mean                                0.107477                0.114513   
std                                 0.306044                0.325777   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_numReturnedasm1  VIN_fracReturnedasm1DivTransactions   DSEligible  
count          5519.000000                          5519.000000  5519.000000  
mean              0.106179                             0.097457     0.906686  
std               0.312184                             0.290763     0.290898  
min               0.000000                             0.000000     0.000000  
25%               0.000000                             0.000000     1.000000  
50%               0.000000                             0.000000     1.000000  
75%               0.000000                             0.000000     1.000000  
max               2.000000                             1.000000     1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.994
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.658
Chunk:
Evaluation time span: 2013-11-03 00:00:00 2013-11-09 00:00:00
Model return rate:    0.179761904762
Original return rate: 0.179761904762
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-09 00:00:00
Model return rate:    0.129012393195
Original return rate: 0.226533679464
########################################
Timestamp: 2017-03-02T10:10:55GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_64', 'pca_comp_69', 'pca_comp_71', 'pca_comp_66', 'VIN_numReturned1', 'DSEligible', 'pca_comp_33', 'pca_comp_38', 'pca_cluster_0_dist', 'pca_comp_60', 'pca_comp_41', 'BuyerID_numReturnedNotNull', 'pca_comp_42', 'cluster_0_dist', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_57', 'pca_comp_09', 'Autocheck_score', 'VIN_numTransactions', 'BuyerID_numReturned1', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_10', 'pca_comp_52', 'PSI', 'cluster_1_dist', 'pca_comp_34', 'VIN_numReturnedasm1', 'pca_cluster_1_dist', 'pca_comp_49', 'pca_comp_30', 'pca_comp_54', 'pca_comp_35', 'pca_comp_65', 'pca_comp_51', 'pca_comp_46', 'pca_comp_00', 'Mileage', 'pca_comp_01', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_72', 'pca_comp_31', 'pca_comp_48', 'JDPowersCat_SPORTSCAR', 'BuyerID_numTransactions', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_07', 'CarYear', 'BuyerID_numReturnedasm1', 'pca_comp_67', 'VIN_fracDSEligible1DivTransactions', 'SellerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_70', 'JDPowersCat_numTransactions', 'pca_comp_06', 'pca_comp_37', 'pca_comp_74', 'pca_comp_43', 'pca_comp_08', 'pca_comp_40', 'pca_comp_75', 'MMR', 'SellingLocation_numReturnedNotNull', 'pca_comp_62', 'VIN_numDSEligible1', 'pca_comp_76', 'BuyerID_fracDSEligible1DivTransactions', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'pca_comp_21', 'BuyerID_numDSEligible1', 'pca_comp_45', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_25', 'pca_comp_04', 'pca_comp_44', 'JDPowersCat_numDSEligible1', 'pca_comp_18', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_02', 'pca_comp_19', 'SellerID_numReturned1', 'pca_comp_14', 'SalePrice', 'pca_comp_77', 'pca_comp_22', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_13', 'pca_comp_58', 'pca_comp_05', 'pca_comp_32', 'pca_comp_55', 'pca_comp_50']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6090.000000             6090.000000   
mean                                0.138820                0.147619   
std                                 0.341631                0.362536   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_64  
count   6090.000000                          6090.000000  6.090000e+03  
mean       0.082430                             0.030706  1.381968e-17  
std        0.275042                             0.171580  3.148035e-02  
min        0.000000                             0.000000 -2.345634e-01  
25%        0.000000                             0.000000 -4.228208e-03  
50%        0.000000                             0.000000  1.266129e-03  
75%        0.000000                             0.000000  5.727146e-03  
max        1.000000                             1.000000  6.151397e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.989
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.976
Chunk:
Evaluation time span: 2013-11-10 00:00:00 2013-11-16 00:00:00
Model return rate:    0.0
Original return rate: 0.178717598909
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-16 00:00:00
Model return rate:    0.123407701937
Original return rate: 0.224253464316
########################################
Timestamp: 2017-03-02T10:11:06GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturnedasm1', 'pca_comp_64', 'VIN_fracReturnedasm1DivTransactions', 'DSEligible', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'Returned_asm', 'pca_comp_55', 'pca_comp_41', 'pca_comp_50', 'pca_comp_61', 'pca_comp_39', 'VIN_numReturned1', 'pca_comp_09', 'SellerID_numTransactions', 'pca_comp_06', 'pca_comp_68', 'pca_comp_70', 'SellerID_numDSEligible1', 'cluster_0_dist', 'pca_cluster_0_dist', 'pca_comp_00', 'BuyerID_numReturnedNotNull', 'BuyerID_fracDSEligible1DivTransactions', 'CarYear', 'pca_comp_37', 'pca_comp_58', 'CarMake_numReturnedasm1', 'CarMake_numReturnedNotNull', 'pca_comp_67', 'pca_comp_20', 'CarMake_numReturned1', 'pca_comp_13', 'BuyerID_numReturned1', 'pca_comp_48', 'CarMake_numDSEligible1', 'pca_comp_11', 'pca_comp_10', 'pca_comp_23', 'pca_comp_45', 'BuyerID_fracReturned1DivReturnedNotNull', 'VIN_numDSEligible1', 'pca_comp_36', 'VIN_numTransactions', 'pca_comp_16', 'pca_cluster_1_dist', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_62', 'CarMake_fracReturnedasm1DivTransactions', 'pca_comp_21', 'pca_comp_33', 'pca_comp_28', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_66', 'pca_comp_53', 'CarMake_fracDSEligible1DivTransactions', 'Mileage', 'pca_comp_05', 'pca_comp_04', 'pca_comp_71', 'CarMake_numTransactions', 'BuyerID_numTransactions', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_59', 'SellingLocation_lat', 'pca_comp_43', 'pca_comp_08', 'pca_comp_44', 'pca_comp_24', 'pca_comp_52', 'SaleDate_decyear', 'pca_comp_69', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_60', 'SellerID_fracReturnedasm1DivTransactions', 'SellerID_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numTransactions', 'pca_comp_19', 'JDPowersCat_MIDSIZECAR', 'pca_comp_02', 'SalePrice', 'JDPowersCat_numTransactions', 'pca_comp_56', 'SellerID_numReturnedNotNull', 'ConditionReport', 'pca_comp_57', 'pca_comp_22', 'pca_comp_46', 'SellingLocation_fracReturnedasm1DivTransactions', 'Simulcast', 'pca_comp_29', 'pca_comp_18', 'Autocheck_score', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_12', 'cluster_1_dist', 'pca_comp_25', 'pca_comp_51', 'SellerID_numReturned1', 'BuyerID_numReturnedasm1']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6190.000000             6190.000000   
mean                                0.098613                0.107108   
std                                 0.292975                0.315483   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturnedasm1   pca_comp_64  
count                          6190.000000          6190.000000  6.190000e+03  
mean                              0.008078             0.103069  4.269339e-18  
std                               0.087695             0.311425  3.184858e-02  
min                               0.000000             0.000000 -5.841709e-01  
25%                               0.000000             0.000000 -4.342334e-03  
50%                               0.000000             0.000000 -1.965280e-03  
75%                               0.000000             0.000000  1.178312e-03  
max                               1.000000             2.000000  3.681335e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.988
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.597
Chunk:
Evaluation time span: 2013-11-17 00:00:00 2013-11-23 00:00:00
Model return rate:    0.183979974969
Original return rate: 0.183979974969
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-23 00:00:00
Model return rate:    0.126773296245
Original return rate: 0.222263450835
########################################
Timestamp: 2017-03-02T10:11:15GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'pca_comp_68', 'pca_comp_71', 'pca_comp_66', 'InLane', 'DSEligible', 'pca_comp_65', 'VIN_fracReturned1DivReturnedNotNull', 'SellerID_fracDSEligible1DivTransactions', 'Arbitrated', 'pca_comp_51', 'pca_comp_61', 'cluster_0_dist', 'pca_comp_22', 'VIN_numDSEligible1', 'pca_comp_37', 'pca_cluster_0_dist', 'VIN_numReturned1', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracDSEligible1DivTransactions', 'BuyerID_numReturnedNotNull', 'VIN_numReturnedasm1', 'pca_comp_03', 'pca_comp_16', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_14', 'pca_comp_54', 'pca_comp_26', 'pca_comp_00', 'pca_comp_64', 'SellingLocation_fracReturned1DivReturnedNotNull', 'cluster_1_dist', 'pca_comp_40', 'pca_comp_50', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_43', 'pca_comp_74', 'pca_comp_25', 'JDPowersCat_numReturnedasm1', 'pca_cluster_1_dist', 'pca_comp_76', 'pca_comp_33', 'pca_comp_49', 'SellerID_numReturnedasm1', 'CarMake_numReturnedNotNull', 'pca_comp_11', 'SellingLocation_lat', 'pca_comp_32', 'pca_comp_07', 'pca_comp_69', 'pca_comp_27', 'Mileage', 'CarMake_numTransactions', 'pca_comp_62', 'pca_comp_23', 'SellingLocation_numReturned1', 'CarMake_fracReturnedNotNullDivDSEligible1', 'SellingLocation_numReturnedasm1', 'JDPowersCat_numDSEligible1', 'SellerID_numReturned1', 'pca_comp_01', 'pca_comp_44', 'pca_comp_48', 'CarMake_numReturned1', 'pca_comp_52', 'SellerID_fracReturned1DivReturnedNotNull', 'SellingLocation_numReturnedNotNull', 'SellerID_numTransactions', 'pca_comp_59', 'SellingLocation_numDSEligible1', 'pca_comp_67', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_73', 'PSI', 'pca_comp_60', 'pca_comp_06', 'BuyerID_numReturned1', 'pca_comp_15', 'BuyerID_numTransactions']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6301.000000             6301.000000   
mean                                0.128868                0.137915   
std                                 0.330203                0.351674   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       Returned_asm   pca_comp_68   pca_comp_71  
count   6301.000000  6.301000e+03  6.301000e+03  
mean       0.081416 -1.407876e-17 -1.256098e-17  
std        0.273494  1.787217e-02  1.229425e-02  
min        0.000000 -3.729362e-01 -1.994148e-01  
25%        0.000000 -1.428422e-03 -1.757616e-03  
50%        0.000000  2.584383e-04 -1.246202e-04  
75%        0.000000  1.824388e-03  1.313047e-03  
max        1.000000  3.348512e-01  1.809703e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.990
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.970
Chunk:
Evaluation time span: 2013-11-24 00:00:00 2013-11-30 00:00:00
Model return rate:    0.0
Original return rate: 0.181818181818
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-11-30 00:00:00
Model return rate:    0.123988301707
Original return rate: 0.221296631655
########################################
Timestamp: 2017-03-02T10:11:26GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_63', 'VIN_numReturnedasm1', 'pca_comp_56', 'VIN_numReturned1', 'BuyerID_fracReturned1DivReturnedNotNull', 'Returned_asm', 'DSEligible', 'pca_comp_61', 'pca_comp_54', 'pca_comp_68', 'Autocheck_score', 'pca_comp_40', 'pca_comp_53', 'SalePrice', 'pca_comp_19', 'pca_comp_58', 'pca_comp_04', 'pca_comp_14', 'pca_comp_55', 'SellerID_numReturned1', 'pca_comp_02', 'pca_comp_72', 'pca_comp_75', 'SellerID_numTransactions', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_35', 'CarMake_numReturnedNotNull', 'pca_comp_25', 'pca_comp_60', 'CarMake_numTransactions', 'pca_comp_22', 'pca_comp_71', 'SellerID_numDSEligible1', 'VIN_numDSEligible1', 'Mileage', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'BuyerID_fracReturnedasm1DivTransactions', 'pca_comp_62', 'CarYear', 'pca_comp_08', 'pca_comp_26', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_28', 'pca_comp_41', 'SellingLocation_fracDSEligible1DivTransactions', 'SellerID_numReturnedasm1', 'pca_comp_31', 'pca_comp_11', 'pca_comp_64', 'VIN_numTransactions', 'SellerID_numReturnedNotNull', 'pca_comp_27', 'pca_comp_23', 'pca_comp_24', 'pca_comp_10', 'pca_comp_07', 'CarMake_numDSEligible1', 'pca_comp_29', 'JDPowersCat_SUV', 'pca_comp_59', 'SellingLocation_lon', 'pca_comp_49', 'pca_comp_69', 'pca_comp_52']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            3574.000000             3574.000000   
mean                                0.089955                0.098769   
std                                 0.279902                0.302122   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  \
count                          3574.000000   
mean                              0.006156   
std                               0.077327   
min                               0.000000   
25%                               0.000000   
50%                               0.000000   
75%                               0.000000   
max                               1.000000   

       VIN_fracReturnedasm1DivTransactions   pca_comp_63  
count                          3574.000000  3.574000e+03  
mean                              0.093593  1.419292e-17  
std                               0.284634  3.166459e-02  
min                               0.000000 -5.151490e-01  
25%                               0.000000 -5.959335e-03  
50%                               0.000000 -2.113078e-03  
75%                               0.000000  3.029562e-03  
max                               1.000000  1.830524e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.998
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.528
Chunk:
Evaluation time span: 2013-12-01 00:00:00 2013-12-07 00:00:00
Model return rate:    0.175675675676
Original return rate: 0.175675675676
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-07 00:00:00
Model return rate:    0.126465065078
Original return rate: 0.219345891598
########################################
Timestamp: 2017-03-02T10:11:36GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_64', 'pca_comp_52', 'pca_comp_65', 'pca_comp_69', 'VIN_numReturned1', 'BuyerID_fracReturned1DivReturnedNotNull', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_70', 'pca_comp_60', 'pca_comp_28', 'pca_comp_58', 'pca_comp_23', 'pca_comp_44', 'pca_comp_08', 'PSI', 'pca_comp_30', 'pca_comp_02', 'DSEligible', 'SellerID_numReturned1', 'SellerID_numReturnedasm1', 'pca_comp_09', 'pca_comp_38', 'PSIEligible', 'pca_comp_07', 'VIN_fracReturnedasm1DivTransactions', 'cluster_1_dist', 'InLane', 'pca_comp_18', 'pca_comp_27', 'CarMake_numDSEligible1', 'SellerID_numReturnedNotNull', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_66', 'VIN_numReturnedasm1', 'BuyerID_numTransactions', 'VIN_numDSEligible1', 'CarMake_numReturnedasm1', 'pca_comp_00', 'pca_comp_33', 'CarMake_numReturnedNotNull', 'pca_comp_11', 'pca_comp_42', 'VIN_numTransactions', 'CarMake_numReturned1', 'pca_comp_34', 'pca_comp_12', 'pca_comp_06', 'pca_comp_05', 'pca_comp_41', 'pca_comp_21', 'pca_comp_57', 'pca_comp_63', 'ConditionReport', 'pca_comp_37', 'pca_comp_15', 'Autocheck_score', 'pca_comp_76', 'SellerID_numTransactions', 'pca_comp_46', 'pca_comp_10', 'pca_comp_01', 'pca_comp_14', 'pca_comp_62', 'pca_comp_48', 'pca_comp_75', 'pca_cluster_1_dist', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_55', 'pca_comp_50', 'CarMake_numTransactions', 'BuyerID_numReturnedNotNull', 'pca_comp_19', 'pca_comp_61', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_03', 'pca_comp_54', 'pca_comp_35', 'pca_comp_51', 'SellingLocation_fracReturnedasm1DivTransactions', 'SellingLocation_fracReturned1DivReturnedNotNull', 'BuyerID_numReturned1', 'SalePrice', 'BuyerID_fracDSEligible1DivTransactions', 'CarMake_fracDSEligible1DivTransactions', 'pca_comp_67']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6150.000000             6150.000000   
mean                                0.119444                0.127154   
std                                 0.319437                0.337055   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                3.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_64  
count   6150.000000                          6150.000000  6.150000e+03  
mean       0.071057                             0.025637 -2.857075e-18  
std        0.256941                             0.157432  2.726085e-02  
min        0.000000                             0.000000 -2.677673e-01  
25%        0.000000                             0.000000 -3.248264e-03  
50%        0.000000                             0.000000  1.086404e-03  
75%        0.000000                             0.000000  4.289136e-03  
max        1.000000                             1.000000  6.152466e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.996
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.972
Chunk:
Evaluation time span: 2013-12-09 00:00:00 2013-12-14 00:00:00
Model return rate:    0.0
Original return rate: 0.170186335404
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-14 00:00:00
Model return rate:    0.121221525666
Original return rate: 0.217160841478
########################################
Timestamp: 2017-03-02T10:11:46GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'VIN_fracReturned1DivReturnedNotNull', 'VIN_numReturned1', 'Returned_asm', 'VIN_numTransactions', 'DSEligible', 'pca_comp_63', 'pca_comp_61', 'SellerID_fracDSEligible1DivTransactions', 'pca_comp_67', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_01', 'pca_comp_69', 'pca_comp_21', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_54', 'SellerID_fracReturnedasm1DivTransactions', 'pca_comp_14', 'BuyerID_numReturned1', 'VIN_numReturnedasm1', 'JDPowersCat_PICKUP', 'SellerID_numTransactions', 'SellerID_numDSEligible1', 'Simulcast', 'pca_comp_41', 'pca_comp_42', 'pca_comp_66', 'cluster_1_dist', 'pca_comp_18', 'InLane', 'pca_comp_04', 'pca_comp_00', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_12', 'BuyerID_numReturnedNotNull', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_38', 'pca_comp_39', 'pca_comp_57', 'SellerID_numReturned1', 'pca_comp_09', 'VIN_numDSEligible1', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_64', 'pca_comp_17', 'pca_comp_33', 'pca_comp_20', 'pca_comp_16', 'pca_comp_73']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6193.000000             6193.000000   
mean                                0.111427                0.118682   
std                                 0.310385                0.328396   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                2.000000   

       VIN_fracReturned1DivReturnedNotNull  VIN_numReturned1  Returned_asm  
count                          6193.000000       6193.000000   6193.000000  
mean                              0.007186          0.007751      0.078960  
std                               0.082779          0.087703      0.269698  
min                               0.000000          0.000000      0.000000  
25%                               0.000000          0.000000      0.000000  
50%                               0.000000          0.000000      0.000000  
75%                               0.000000          0.000000      0.000000  
max                               1.000000          1.000000      1.000000  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.990
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.650
Chunk:
Evaluation time span: 2013-12-15 00:00:00 2013-12-20 00:00:00
Model return rate:    0.144144144144
Original return rate: 0.144144144144
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-20 00:00:00
Model return rate:    0.122699030254
Original return rate: 0.212944175641
########################################
Timestamp: 2017-03-02T10:11:57GMT
`columns.pkl`, `transformer_scaler.pkl`, `transformer_pca.pkl`: Save column order and transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Save transformers.
`important_features` =
['VIN_fracReturnedNotNullDivDSEligible1', 'VIN_numReturnedNotNull', 'Returned_asm', 'VIN_fracReturned1DivReturnedNotNull', 'pca_comp_71', 'pca_comp_64', 'pca_comp_65', 'pca_comp_67', 'DSEligible', 'VIN_numReturned1', 'BuyerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_07', 'pca_comp_43', 'BuyerID_numReturnedasm1', 'BuyerID_fracDSEligible1DivTransactions', 'Mileage', 'Simulcast', 'VIN_numDSEligible1', 'SellingLocation_lon', 'VIN_fracReturnedasm1DivTransactions', 'pca_comp_37', 'pca_comp_52', 'pca_comp_13', 'pca_comp_10', 'pca_comp_25', 'pca_comp_48', 'pca_comp_02', 'BuyerID_fracReturnedasm1DivTransactions', 'CarYear', 'pca_comp_44', 'SellerID_fracReturnedNotNullDivDSEligible1', 'pca_comp_51', 'pca_comp_50', 'pca_comp_28', 'BuyerID_numTransactions', 'BuyerID_numDSEligible1', 'pca_comp_49', 'pca_comp_60', 'PSI', 'BuyerID_fracReturned1DivReturnedNotNull', 'pca_comp_36', 'pca_comp_47', 'pca_comp_05', 'pca_comp_53', 'JDPowersCat_fracReturnedNotNullDivDSEligible1', 'pca_comp_56', 'pca_comp_57', 'pca_comp_03', 'pca_comp_09', 'pca_comp_42', 'pca_comp_26', 'pca_comp_00', 'pca_comp_27', 'InLane', 'pca_comp_17', 'pca_comp_68', 'SellingLocation_numTransactions', 'pca_comp_34', 'pca_comp_38', 'CarMake_numReturnedasm1', 'pca_comp_40', 'pca_comp_30', 'LIGHTY', 'SellerID_fracReturnedasm1DivTransactions', 'SellingLocation_numDSEligible1', 'SellerID_fracReturned1DivReturnedNotNull', 'pca_comp_59', 'VIN_numTransactions', 'pca_comp_29', 'pca_comp_61', 'pca_comp_74', 'pca_comp_76', 'pca_comp_18', 'pca_comp_62', 'JDPowersCat_fracReturned1DivReturnedNotNull', 'CarMake_numReturnedNotNull', 'CarMake_fracReturnedasm1DivTransactions', 'BuyerID_numReturnedNotNull', 'BuyerID_numReturned1', 'MMR', 'pca_comp_16', 'JDPowersCat_COMPACTCAR', 'pca_comp_77', 'SellingLocation_numReturned1', 'CarMake_numDSEligible1', 'SellingLocation_numReturnedasm1', 'pca_comp_23', 'CarMake_numTransactions', 'pca_comp_12', 'pca_comp_35', 'SaleDate_dow', 'pca_comp_46', 'CarMake_numReturned1', 'pca_comp_33', 'SellingLocation_lat', 'SellerID_numTransactions', 'LIGHTR', 'Salvage', 'pca_comp_58', 'SellerID_numReturned1', 'pca_comp_45', 'pca_comp_32', 'SellingLocation_fracReturnedNotNullDivDSEligible1', 'pca_comp_22', 'pca_comp_41', 'CarMake_fracReturned1DivReturnedNotNull', 'pca_comp_55', 'pca_comp_08', 'pca_comp_14', 'SellingLocation_fracReturned1DivReturnedNotNull', 'JDPowersCat_fracReturnedasm1DivTransactions', 'SellingLocation_numReturnedNotNull', 'SellingLocation_fracDSEligible1DivTransactions', 'pca_comp_20', 'pca_comp_63', 'pca_comp_75', 'JDPowersCat_numReturned1', 'pca_comp_73', 'SalePrice', 'pca_comp_54', 'VIN_fracDSEligible1DivTransactions', 'pca_comp_01', 'pca_cluster']
Summarize top 5 important features:
       VIN_fracReturnedNotNullDivDSEligible1  VIN_numReturnedNotNull  \
count                            6082.000000             6082.000000   
mean                                0.181538                0.191878   
std                                 0.380963                0.402483   
min                                 0.000000                0.000000   
25%                                 0.000000                0.000000   
50%                                 0.000000                0.000000   
75%                                 0.000000                0.000000   
max                                 1.000000                3.000000   

       Returned_asm  VIN_fracReturned1DivReturnedNotNull   pca_comp_71  
count   6082.000000                          6082.000000  6.082000e+03  
mean       0.084512                             0.031020  1.316872e-17  
std        0.278177                             0.172569  1.295064e-02  
min        0.000000                             0.000000 -1.915956e-01  
25%        0.000000                             0.000000 -1.696117e-03  
50%        0.000000                             0.000000 -1.835708e-04  
75%        0.000000                             0.000000  1.324500e-03  
max        1.000000                             1.000000  2.197499e-01  
Progress: 20% 40% 60% 80% 100% 

Model score = 0.987
`features.pkl`, `estimator.pkl`: Save features and estimator.
`transformer_scaler`, `transformer_pca`: Load existing transformers.
`transformer_kmeans.pkl`, `transformer_kmeans_pca.pkl`: Load transformers.
`features.pkl`, `estimator.pkl`: Save features and estimator.
Model score = 0.962
Chunk:
Evaluation time span: 2013-12-22 00:00:00 2013-12-28 00:00:00
Model return rate:    0.0
Original return rate: 0.142857142857
Overall:
Evaluation time span: 2013-02-03 00:00:00 2013-12-28 00:00:00
Model return rate:    0.12117910191
Original return rate: 0.212037179685
In [9]:
# Overall return rate is Returned==1 / (Returned not null)
print("Plot performance of model")
path_plot_dir = os.path.join(path_data_dir, 'plot_performance_model')

# Plot overall return rate.
(xvals_orig, yvals_orig) = zip(*[(key[1], val) for (key, val) in sorted(retrates_orig_all.items())])
(xvals_modl, yvals_modl) = zip(*[(key[1], val) for (key, val) in sorted(retrates_modl_all.items())])
(xvals_diff, yvals_diff) = (xvals_orig, np.subtract(yvals_modl, yvals_orig))
plt.plot(
    xvals_orig, yvals_orig, marker='.', linestyle='-',
    color=sns.color_palette()[0], label='original return rate')
plt.plot(
    xvals_modl, yvals_modl, marker='.', linestyle='-',
    color=sns.color_palette()[1], label='model return rate')
plt.plot(
    xvals_diff, yvals_diff, marker='.', linestyle='--',
    color=sns.color_palette()[3], label='diff model-original')
plt.title("Return rates vs SaleDate")
plt.xlabel("SaleDate")
plt.ylabel("Overall return rate")
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig(
    os.path.join(path_plot_dir, 'perf-modl_returnrate_vs_saledate.png'),
    dpi=300)
plt.show()

# Plot return rate by buyer.
# Note: to get buyerid with col max: df_orig.loc[df_orig[col].argmax(), 'BuyerID']
buyerids = [
    '272356', # buyer with max BuyerID_numReturnedNotNull (max accepted DealShield)
    '328701', # buyer with max BuyerID_numReturned1 (max returns)
    '179863', # buyer with max BuyerID_fracReturned1DivReturnedNotNull (max return rate) and frequenly prohibited (36 weeks)
    '46857', # buyers frequently prohibited (38 weeks)
    '62851', # buyers frequently prohibited (36 weeks)
    '16640', # buyers frequently prohibited (36 weeks)
    '61773', # buyers frequently prohibited (35 weeks)
    '20718', # buyers frequently prohibited (35 weeks)
    '18584', # buyers frequently prohibited (34 weeks)
    '248009'] # buyers frequently prohibited (34 weeks)

for buyerid in buyerids:
    print('#'*40)
    print('BuyerID:', buyerid)

    fig = plt.figure()
    ax0 = fig.add_subplot(111)
    ax0.set_title(textwrap.dedent("""\
        Returns and return rates vs num transactions
        for BuyerID={buyerid}""".format(buyerid=buyerid)))
    ax0.set_xlabel('BuyerID_numTransactions')

    ax0.set_ylabel('BuyerID_numReturned1')
    lns0 = list()
    xvals_orig = df_orig.loc[df_orig['BuyerID']==buyerid, 'BuyerID_numTransactions'].values
    yvals0_orig = df_orig.loc[df_orig['BuyerID']==buyerid, 'BuyerID_numReturned1'].values
    lns0 += ax0.plot(
        xvals_orig, yvals0_orig, marker='.', linestyle='-', 
        color=sns.color_palette(palette='dark')[0], label='original BuyerID_numReturned1')
    xvals_modl = df_modl.loc[df_modl['BuyerID']==buyerid, 'BuyerID_numTransactions'].values
    yvals0_modl = df_modl.loc[df_modl['BuyerID']==buyerid, 'BuyerID_numReturned1'].values
    lns0 += ax0.plot(
        xvals_modl, yvals0_modl, marker='.', linestyle='-', 
        color=sns.color_palette(palette=None)[0], label='model BuyerID_numReturned1')
    ylim0 = (
        min(min(yvals0_orig), min(yvals0_modl)),
        max(max(max(yvals0_orig), max(yvals0_modl)), 1))
    ax0.set_ylim(ylim0)
    nticks = 6
    ax0.set_yticks(np.linspace(start=ylim0[0], stop=ylim0[1], num=nticks, endpoint=True))
    ax0.legend(lns0, [ln.get_label() for ln in lns0], loc='upper left')

    ax1 = ax0.twinx()
    ax1.set_ylabel(buyer_retrate)
    lns1 = list()
    yvals1_orig = df_orig.loc[df_orig['BuyerID']==buyerid, buyer_retrate].values
    lns1 += ax1.plot(
        xvals_orig, yvals1_orig, marker='.', linestyle='-', 
        color=sns.color_palette(palette='dark')[1], label='original '+buyer_retrate)
    yvals1_modl = df_modl.loc[df_modl['BuyerID']==buyerid, buyer_retrate].values
    lns1 += ax1.plot(
        xvals_modl, yvals1_modl, marker='.', linestyle='-', 
        color=sns.color_palette(palette=None)[1], label='model '+buyer_retrate)
    ylim1 = (0, 1)
    ax1.set_ylim(ylim1)
    ax1.set_yticks(np.linspace(start=ylim1[0], stop=ylim1[1], num=nticks, endpoint=True))
    ax1.legend(lns1, [ln.get_label() for ln in lns1], loc='lower right')
    plt.tight_layout()
    plt.savefig(
        os.path.join(path_plot_dir, 'perf-modl_returnrate_vs_transactions_for_'+buyerid+'.png'),
        dpi=300)
    plt.show(fig)
Plot performance of model
########################################
BuyerID: 272356
########################################
BuyerID: 328701
########################################
BuyerID: 179863
########################################
BuyerID: 46857
########################################
BuyerID: 62851
########################################
BuyerID: 16640
########################################
BuyerID: 61773
########################################
BuyerID: 20718
########################################
BuyerID: 18584
########################################
BuyerID: 248009
In [8]:
print
Out[8]:
<function print>
In [ ]:
del df_chunk, df_eval, df_modl, df_orig, df_test, df_train
gc.collect()